diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml index 91663946de460c40f126184b34a3d5cea2a57daa..983c3c132e80561104ee56eb1695e9c15c893306 100644 --- a/.gitlab-ci.d/buildtest.yml +++ b/.gitlab-ci.d/buildtest.yml @@ -579,6 +579,9 @@ build-tci: - make check-tcg # Check our reduced build configurations +# requires libfdt: aarch64, arm, i386, loongarch64, microblaze, microblazeel, +# mips64el, or1k, ppc, ppc64, riscv32, riscv64, rx, x86_64 +# does not build without boards: i386, s390x, sh4, sh4eb, x86_64 build-without-defaults: extends: .native_build_job_template needs: diff --git a/.travis.yml b/.travis.yml index 76859d48da5a1bff564111b44d73cd1d9f8695f9..597d151b802d7b8e31998e986fab8aef00baef3c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ env: - TEST_BUILD_CMD="" - TEST_CMD="make check V=1" # This is broadly a list of "mainline" system targets which have support across the major distros - - MAIN_SOFTMMU_TARGETS="aarch64-softmmu,mips64-softmmu,ppc64-softmmu,riscv64-softmmu,s390x-softmmu,x86_64-softmmu" + - MAIN_SYSTEM_TARGETS="aarch64-softmmu,mips64-softmmu,ppc64-softmmu,riscv64-softmmu,s390x-softmmu,x86_64-softmmu" - CCACHE_SLOPPINESS="include_file_ctime,include_file_mtime" - CCACHE_MAXSIZE=1G - G_MESSAGES_DEBUG=error @@ -114,7 +114,7 @@ jobs: env: - TEST_CMD="make check check-tcg V=1" - CONFIG="--disable-containers --enable-fdt=system - --target-list=${MAIN_SOFTMMU_TARGETS} --cxx=/bin/false" + --target-list=${MAIN_SYSTEM_TARGETS} --cxx=/bin/false" - UNRELIABLE=true - name: "[ppc64] GCC check-tcg" @@ -185,7 +185,7 @@ jobs: env: - TEST_CMD="make check check-tcg V=1" - CONFIG="--disable-containers --enable-fdt=system - --target-list=${MAIN_SOFTMMU_TARGETS},s390x-linux-user" + --target-list=${MAIN_SYSTEM_TARGETS},s390x-linux-user" - UNRELIABLE=true script: - BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$? @@ -226,7 +226,7 @@ jobs: - genisoimage env: - CONFIG="--disable-containers --enable-fdt=system --audio-drv-list=sdl - --disable-user --target-list-exclude=${MAIN_SOFTMMU_TARGETS}" + --disable-user --target-list-exclude=${MAIN_SYSTEM_TARGETS}" - name: "[s390x] GCC (user)" arch: s390x diff --git a/Kconfig.host b/Kconfig.host index f496475f8e2ef89533c1e5f97e42fc8f3e74ac5e..faf58d9af513fa8f18882bf8ab0b04bd6fc67b57 100644 --- a/Kconfig.host +++ b/Kconfig.host @@ -28,6 +28,7 @@ config VHOST_USER config VHOST_VDPA bool + select IOMMUFD config VHOST_KERNEL bool diff --git a/MAINTAINERS b/MAINTAINERS index 695e0bd34fbba253d77570e5b3ef8dabe7a174b3..ada87bfa9e063798cf366992aacc09bfdcfd0230 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2167,6 +2167,19 @@ F: hw/vfio/ap.c F: docs/system/s390x/vfio-ap.rst L: qemu-s390x@nongnu.org +iommufd +M: Yi Liu +M: Eric Auger +M: Zhenzhong Duan +S: Supported +F: backends/iommufd.c +F: include/sysemu/iommufd.h +F: backends/host_iommu_device.c +F: include/sysemu/host_iommu_device.h +F: include/qemu/chardev_open.h +F: util/chardev_open.c +F: docs/devel/vfio-iommufd.rst + vhost M: Michael S. 
Tsirkin S: Supported diff --git a/accel/kvm/kvm-accel-ops.c b/accel/kvm/kvm-accel-ops.c index 6195150a0b4d2f310614d4efa18d409dd0c28044..54f19028b828d8e329160ec26359c618f1f0b317 100644 --- a/accel/kvm/kvm-accel-ops.c +++ b/accel/kvm/kvm-accel-ops.c @@ -112,6 +112,9 @@ static void kvm_accel_ops_class_init(ObjectClass *oc, void *data) ops->remove_breakpoint = kvm_remove_breakpoint; ops->remove_all_breakpoints = kvm_remove_all_breakpoints; #endif + + ops->control_pre_system_reset = kvm_cpus_control_pre_system_reset; + ops->control_post_system_reset = kvm_cpus_control_post_system_reset; } static const TypeInfo kvm_accel_ops_type = { diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index e39a810a4e92333a251711a223eb57b05dc2033d..7d175d3262fe0b78097b526d11660053013af932 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -52,6 +52,8 @@ #include "hw/boards.h" #include "sysemu/stats.h" +#include "sysemu/kvm.h" + /* This check must be after config-host.h is included */ #ifdef CONFIG_EVENTFD #include @@ -86,6 +88,9 @@ struct KVMParkedVcpu { }; KVMState *kvm_state; + +bool virtcca_cvm_allowed = false; + bool kvm_kernel_irqchip; bool kvm_split_irqchip; bool kvm_async_interrupts_allowed; @@ -98,6 +103,7 @@ bool kvm_allowed; bool kvm_readonly_mem_allowed; bool kvm_vm_attributes_allowed; bool kvm_msi_use_devid; +bool kvm_csv3_allowed; bool kvm_has_guest_debug; static int kvm_sstep_flags; static bool kvm_immediate_exit; @@ -324,14 +330,72 @@ err: return ret; } +void kvm_park_vcpu(CPUState *cpu) +{ + struct KVMParkedVcpu *vcpu; + + trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); + + vcpu = g_malloc0(sizeof(*vcpu)); + vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); + vcpu->kvm_fd = cpu->kvm_fd; + QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); +} + +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id) +{ + struct KVMParkedVcpu *cpu; + int kvm_fd = -ENOENT; + + QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { + if (cpu->vcpu_id == vcpu_id) { + QLIST_REMOVE(cpu, node); + kvm_fd = cpu->kvm_fd; + g_free(cpu); + break; + } + } + + trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? 
"unparked" : "!found parked"); + + return kvm_fd; +} + +int kvm_create_vcpu(CPUState *cpu) +{ + unsigned long vcpu_id = kvm_arch_vcpu_id(cpu); + KVMState *s = kvm_state; + int kvm_fd; + + /* check if the KVM vCPU already exist but is parked */ + kvm_fd = kvm_unpark_vcpu(s, vcpu_id); + if (kvm_fd < 0) { + /* vCPU not parked: create a new KVM vCPU */ + kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id); + if (kvm_fd < 0) { + error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id); + return kvm_fd; + } + } + + cpu->kvm_fd = kvm_fd; + cpu->kvm_state = s; + cpu->vcpu_dirty = true; + cpu->dirty_pages = 0; + cpu->throttle_us_per_full = 0; + + trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd); + + return 0; +} + static int do_kvm_destroy_vcpu(CPUState *cpu) { KVMState *s = kvm_state; long mmap_size; - struct KVMParkedVcpu *vcpu = NULL; int ret = 0; - DPRINTF("kvm_destroy_vcpu\n"); + trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); ret = kvm_arch_destroy_vcpu(cpu); if (ret < 0) { @@ -357,10 +421,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu) } } - vcpu = g_malloc0(sizeof(*vcpu)); - vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); - vcpu->kvm_fd = cpu->kvm_fd; - QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); + kvm_park_vcpu(cpu); err: return ret; } @@ -373,24 +434,6 @@ void kvm_destroy_vcpu(CPUState *cpu) } } -static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) -{ - struct KVMParkedVcpu *cpu; - - QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { - if (cpu->vcpu_id == vcpu_id) { - int kvm_fd; - - QLIST_REMOVE(cpu, node); - kvm_fd = cpu->kvm_fd; - g_free(cpu); - return kvm_fd; - } - } - - return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); -} - int kvm_init_vcpu(CPUState *cpu, Error **errp) { KVMState *s = kvm_state; @@ -399,19 +442,14 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); - ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); + ret = kvm_create_vcpu(cpu); if (ret < 0) { - error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)", + error_setg_errno(errp, -ret, + "kvm_init_vcpu: kvm_create_vcpu failed (%lu)", kvm_arch_vcpu_id(cpu)); goto err; } - cpu->kvm_fd = ret; - cpu->kvm_state = s; - cpu->vcpu_dirty = true; - cpu->dirty_pages = 0; - cpu->throttle_us_per_full = 0; - mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); if (mmap_size < 0) { ret = mmap_size; @@ -1699,6 +1737,36 @@ static void kvm_io_ioeventfd_add(MemoryListener *listener, } } +static int kvm_ioeventfd_batch(bool start) +{ + int ret; + struct kvm_ioeventfd iofd = { + .flags = start ? 
+ KVM_IOEVENTFD_FLAG_BATCH_BEGIN : KVM_IOEVENTFD_FLAG_BATCH_END, + }; + + if (!kvm_enabled()) { + return -ENOSYS; + } + + ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); + if (ret < 0) { + return -errno; + } + + return 0; +} + +static void kvm_ioeventfd_begin(MemoryListener *listener) +{ + kvm_ioeventfd_batch(true); +} + +static void kvm_ioeventfd_end(MemoryListener *listener) +{ + kvm_ioeventfd_batch(false); +} + static void kvm_io_ioeventfd_del(MemoryListener *listener, MemoryRegionSection *section, bool match_data, uint64_t data, @@ -1834,7 +1902,10 @@ void kvm_irqchip_commit_routes(KVMState *s) s->irq_routes->flags = 0; trace_kvm_irqchip_commit_routes(); ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); - assert(ret == 0); + if (ret < 0) { + error_report("Set GSI routing failed: %m"); + abort(); + } } static void kvm_add_routing_entry(KVMState *s, @@ -1861,10 +1932,11 @@ static void kvm_add_routing_entry(KVMState *s, set_gsi(s, entry->gsi); } -static int kvm_update_routing_entry(KVMState *s, +static int kvm_update_routing_entry(KVMRouteChange *c, struct kvm_irq_routing_entry *new_entry) { struct kvm_irq_routing_entry *entry; + KVMState *s = c->s; int n; for (n = 0; n < s->irq_routes->nr; n++) { @@ -1878,7 +1950,7 @@ static int kvm_update_routing_entry(KVMState *s, } *entry = *new_entry; - + c->changes++; return 0; } @@ -2010,7 +2082,7 @@ int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) return virq; } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev) { struct kvm_irq_routing_entry kroute = {}; @@ -2033,13 +2105,14 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, kroute.flags = KVM_MSI_VALID_DEVID; kroute.u.msi.devid = pci_requester_id(dev); } - if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { + if (msg.address && + kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { return -EINVAL; } trace_kvm_irqchip_update_msi_route(virq); - return kvm_update_routing_entry(s, &kroute); + return kvm_update_routing_entry(c, &kroute); } static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, @@ -2181,7 +2254,7 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, abort(); } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg) { return -ENOSYS; } @@ -2308,7 +2381,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id) bool kvm_dirty_ring_enabled(void) { - return kvm_state->kvm_dirty_ring_size ? 
true : false; + return kvm_state && kvm_state->kvm_dirty_ring_size; } static void query_stats_cb(StatsResultList **result, StatsTarget target, @@ -2320,6 +2393,11 @@ uint32_t kvm_dirty_ring_size(void) return kvm_state->kvm_dirty_ring_size; } +static inline bool kvm_is_virtcca_cvm_type(int type) +{ + return type & VIRTCCA_CVM_TYPE; +} + static int kvm_init(MachineState *ms) { MachineClass *mc = MACHINE_GET_CLASS(ms); @@ -2344,6 +2422,7 @@ static int kvm_init(MachineState *ms) qemu_mutex_init(&kml_slots_lock); s = KVM_STATE(ms->accelerator); + kvm_state = s; /* * On systems where the kernel can support different base page @@ -2412,6 +2491,10 @@ static int kvm_init(MachineState *ms) goto err; } + if (kvm_is_virtcca_cvm_type(type)) { + virtcca_cvm_allowed = true; + } + do { ret = kvm_ioctl(s, KVM_CREATE_VM, type); } while (ret == -EINTR); @@ -2559,8 +2642,6 @@ static int kvm_init(MachineState *ms) #endif } - kvm_state = s; - ret = kvm_arch_init(ms, s); if (ret < 0) { goto err; @@ -2580,6 +2661,8 @@ static int kvm_init(MachineState *ms) s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; + s->memory_listener.listener.eventfd_begin = kvm_ioeventfd_begin; + s->memory_listener.listener.eventfd_end = kvm_ioeventfd_end; kvm_memory_listener_register(s, &s->memory_listener, &address_space_memory, 0, "kvm-memory"); @@ -2761,6 +2844,16 @@ void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); } +void kvm_cpus_control_pre_system_reset(void) +{ + kvm_vm_ioctl(kvm_state, KVM_CONTROL_VCPU_PRE_SYSTEM_RESET, NULL); +} + +void kvm_cpus_control_post_system_reset(void) +{ + kvm_vm_ioctl(kvm_state, KVM_CONTROL_VCPU_POST_SYSTEM_RESET, NULL); +} + #ifdef KVM_HAVE_MCE_INJECTION static __thread void *pending_sigbus_addr; static __thread int pending_sigbus_code; @@ -2993,7 +3086,7 @@ int kvm_cpu_exec(CPUState *cpu) if (ret < 0) { cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); - vm_stop(RUN_STATE_INTERNAL_ERROR); + qemu_system_guest_panicked(cpu_get_crash_info(cpu)); } qatomic_set(&cpu->exit_request, 0); @@ -3158,6 +3251,31 @@ bool kvm_arm_supports_user_irq(void) return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); } +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size) +{ + KVMState *s = kvm_state; + int size, ret; + + if (s == NULL || !kvm_check_extension(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK)) { + return; + } + + size = hdbss_buffer_size; + if (size < 0 || size > MAX_HDBSS_BUFFER_SIZE) { + fprintf(stderr, "Invalid hdbss buffer size: %d\n", size); + return; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_ARM_HW_DIRTY_STATE_TRACK, 0, + enable ? size : 0); + if (ret) { + fprintf(stderr, "Could not %s KVM_CAP_ARM_HW_DIRTY_STATE_TRACK: %d\n", + enable ? 
"enable" : "disable", ret); + } + + return; +} + #ifdef KVM_CAP_SET_GUEST_DEBUG struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc) { @@ -3468,6 +3586,28 @@ int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) return r; } +int kvm_load_user_data(hwaddr loader_start, hwaddr dtb_info, hwaddr data_start, hwaddr data_size, hwaddr ram_size, + struct kvm_numa_info *numa_info) +{ + KVMState *state = kvm_state; + struct kvm_user_data data; + int ret; + + data.loader_start = loader_start; + data.dtb_info = dtb_info; + data.data_start = data_start; + data.data_size = data_size; + data.ram_size = ram_size; + memcpy(&data.numa_info, numa_info, sizeof(struct kvm_numa_info)); + + ret = kvm_vm_ioctl(state, KVM_LOAD_USER_DATA, &data); + if (ret < 0) { + error_report("%s: KVM_LOAD_USER_DATA failed!\n", __func__); + } + + return ret; +} + static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, hwaddr start_addr, hwaddr size) { @@ -3568,6 +3708,11 @@ bool kvm_kernel_irqchip_split(void) return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON; } +bool kvm_smccc_filter_enabled(void) +{ + return kvm_state->kvm_smccc_filter_enabled; +} + static void kvm_get_dirty_ring_size(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -3613,6 +3758,7 @@ static void kvm_accel_instance_init(Object *obj) /* KVM dirty ring is by default off */ s->kvm_dirty_ring_size = 0; s->kvm_dirty_ring_with_bitmap = false; + s->kvm_smccc_filter_enabled = false; s->kvm_eager_split_size = 0; s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN; s->notify_window = 0; diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h index ca40add32c3dd90f74ed692c88796323e6b08584..27b9d0d9dbfc83bb5060e48795ae6b858030c061 100644 --- a/accel/kvm/kvm-cpus.h +++ b/accel/kvm/kvm-cpus.h @@ -23,4 +23,7 @@ int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); void kvm_remove_all_breakpoints(CPUState *cpu); +void kvm_cpus_control_pre_system_reset(void); +void kvm_cpus_control_post_system_reset(void); + #endif /* KVM_CPUS_H */ diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events index 399aaeb0ec757cf50ba44943d4ab4d30e02c1a0d..9c880fdcf4f67b985dfb94cea39428a291c9d7e3 100644 --- a/accel/kvm/trace-events +++ b/accel/kvm/trace-events @@ -9,6 +9,10 @@ kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" kvm_init_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_create_vcpu(int cpu_index, unsigned long arch_cpu_id, int kvm_fd) "index: %d, id: %lu, kvm fd: %d" +kvm_destroy_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_park_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_unpark_vcpu(unsigned long arch_cpu_id, const char *msg) "id: %lu %s" kvm_irqchip_commit_routes(void) "" kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" @@ -26,3 +30,10 @@ kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"P kvm_dirty_ring_reaper_kick(const char *reason) "%s" kvm_dirty_ring_flush(int finished) "%d" +kvm_failed_get_vcpu_mmap_size(void) "" +kvm_cpu_exec(void) "" +kvm_interrupt_exit_request(void) 
"" +kvm_io_window_exit(void) "" +kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_even_type %"PRIu32 +kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s" +kvm_memory_fault(uint64_t start, uint64_t size, uint64_t flags) "start 0x%" PRIx64 " size 0x%" PRIx64 " flags 0x%" PRIx64 diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 1b37d9a302cccce5f7636bff492cdd2401996d48..e68f3433ad2c816b449eb015be3134f0124d59a5 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -25,6 +25,10 @@ bool kvm_allowed; bool kvm_readonly_mem_allowed; bool kvm_msi_use_devid; +bool virtcca_cvm_allowed; + +bool kvm_csv3_allowed; + void kvm_flush_coalesced_mmio_buffer(void) { } @@ -61,7 +65,7 @@ void kvm_irqchip_release_virq(KVMState *s, int virq) { } -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev) { return -ENOSYS; @@ -115,6 +119,11 @@ bool kvm_arm_supports_user_irq(void) return false; } +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size) +{ + g_assert_not_reached(); +} + bool kvm_dirty_ring_enabled(void) { return false; @@ -124,3 +133,9 @@ uint32_t kvm_dirty_ring_size(void) { return 0; } + +int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, + struct kvm_numa_info *numa_info) +{ + return -ENOSYS; +} diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c index 3d2a896220cc4ad9646af55f1cf850273062e6db..eb37f9e8a89c20b77bb86a350754562049cb1bee 100644 --- a/accel/tcg/tb-maint.c +++ b/accel/tcg/tb-maint.c @@ -712,7 +712,7 @@ static void tb_record(TranslationBlock *tb) tb_page_addr_t paddr0 = tb_page_addr0(tb); tb_page_addr_t paddr1 = tb_page_addr1(tb); tb_page_addr_t pindex0 = paddr0 >> TARGET_PAGE_BITS; - tb_page_addr_t pindex1 = paddr0 >> TARGET_PAGE_BITS; + tb_page_addr_t pindex1 = paddr1 >> TARGET_PAGE_BITS; assert(paddr0 != -1); if (unlikely(paddr1 != -1) && pindex0 != pindex1) { @@ -744,7 +744,7 @@ static void tb_remove(TranslationBlock *tb) tb_page_addr_t paddr0 = tb_page_addr0(tb); tb_page_addr_t paddr1 = tb_page_addr1(tb); tb_page_addr_t pindex0 = paddr0 >> TARGET_PAGE_BITS; - tb_page_addr_t pindex1 = paddr0 >> TARGET_PAGE_BITS; + tb_page_addr_t pindex1 = paddr1 >> TARGET_PAGE_BITS; assert(paddr0 != -1); if (unlikely(paddr1 != -1) && pindex0 != pindex1) { diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c index fac80095bbd09944870a29312de0a906c63ca3c0..73866990ced13943d8f7d71aff71ae71ec855f04 100644 --- a/accel/tcg/tcg-accel-ops-mttcg.c +++ b/accel/tcg/tcg-accel-ops-mttcg.c @@ -122,6 +122,7 @@ static void *mttcg_cpu_thread_fn(void *arg) qemu_mutex_unlock_iothread(); rcu_remove_force_rcu_notifier(&force_rcu.notifier); rcu_unregister_thread(); + tcg_unregister_thread(); return NULL; } diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index 68b252cb8e8d55a17ce0fe4c1db44fa89012747c..e87848a5e25abac2d6cb870b2516683fa9bda364 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -794,7 +794,7 @@ static int probe_access_internal(CPUArchState *env, vaddr addr, if (guest_addr_valid_untagged(addr)) { int page_flags = page_get_flags(addr); if (page_flags & acc_flag) { - if ((acc_flag == PAGE_READ || acc_flag == PAGE_WRITE) + if (access_type != MMU_INST_FETCH && cpu_plugin_mem_cbs_enabled(env_cpu(env))) { return TLB_MMIO; } diff --git a/audio/audio.c 
b/audio/audio.c index 8d1e4ad92271b39893c4e6454cce4d66d36458fa..7ac74f9e16e0fc5b6d49aae3569152c3c0ad753c 100644 --- a/audio/audio.c +++ b/audio/audio.c @@ -1744,7 +1744,7 @@ static AudioState *audio_init(Audiodev *dev, Error **errp) if (driver) { done = !audio_driver_init(s, driver, dev, errp); } else { - error_setg(errp, "Unknown audio driver `%s'\n", drvname); + error_setg(errp, "Unknown audio driver `%s'", drvname); } if (!done) { goto out; diff --git a/audio/pwaudio.c b/audio/pwaudio.c index 3ce5f6507b47270c1283cabf5b47ad81932d6e5e..5d1c7126d33e8e2a362ec80f9c930a854603f9c2 100644 --- a/audio/pwaudio.c +++ b/audio/pwaudio.c @@ -770,13 +770,15 @@ qpw_audio_init(Audiodev *dev, Error **errp) pw->core = pw_context_connect(pw->context, NULL, 0); if (pw->core == NULL) { pw_thread_loop_unlock(pw->thread_loop); - goto fail_error; + error_setg_errno(errp, errno, "Failed to connect to PipeWire instance"); + goto fail; } if (pw_core_add_listener(pw->core, &pw->core_listener, &core_events, pw) < 0) { pw_thread_loop_unlock(pw->thread_loop); - goto fail_error; + error_setg(errp, "Failed to add PipeWire listener"); + goto fail; } if (wait_resync(pw) < 0) { pw_thread_loop_unlock(pw->thread_loop); @@ -786,8 +788,6 @@ qpw_audio_init(Audiodev *dev, Error **errp) return g_steal_pointer(&pw); -fail_error: - error_setg(errp, "Failed to initialize PW context"); fail: if (pw->thread_loop) { pw_thread_loop_stop(pw->thread_loop); diff --git a/backends/Kconfig b/backends/Kconfig index f35abc16092808b1fe5b033a346908e2d66bff0b..8d0be5a263ff1ca6231a766990b1b89068d06a2e 100644 --- a/backends/Kconfig +++ b/backends/Kconfig @@ -1 +1,4 @@ source tpm/Kconfig + +config IOMMUFD + bool diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c index 39d04552807fc15fce884c2925ed57af17877bef..940104ee55222337b577ee23944a63c3c25d46bf 100644 --- a/backends/cryptodev-builtin.c +++ b/backends/cryptodev-builtin.c @@ -23,6 +23,7 @@ #include "qemu/osdep.h" #include "sysemu/cryptodev.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "standard-headers/linux/virtio_crypto.h" #include "crypto/cipher.h" @@ -396,8 +397,8 @@ static int cryptodev_builtin_create_session( case VIRTIO_CRYPTO_HASH_CREATE_SESSION: case VIRTIO_CRYPTO_MAC_CREATE_SESSION: default: - error_setg(&local_error, "Unsupported opcode :%" PRIu32 "", - sess_info->op_code); + error_report("Unsupported opcode :%" PRIu32 "", + sess_info->op_code); return -VIRTIO_CRYPTO_NOTSUPP; } @@ -427,7 +428,9 @@ static int cryptodev_builtin_close_session( CRYPTODEV_BACKEND_BUILTIN(backend); CryptoDevBackendBuiltinSession *session; - assert(session_id < MAX_NUM_SESSIONS && builtin->sessions[session_id]); + if (session_id >= MAX_NUM_SESSIONS || !builtin->sessions[session_id]) { + return -VIRTIO_CRYPTO_INVSESS; + } session = builtin->sessions[session_id]; if (session->cipher) { @@ -552,8 +555,8 @@ static int cryptodev_builtin_operation( if (op_info->session_id >= MAX_NUM_SESSIONS || builtin->sessions[op_info->session_id] == NULL) { - error_setg(&local_error, "Cannot find a valid session id: %" PRIu64 "", - op_info->session_id); + error_report("Cannot find a valid session id: %" PRIu64 "", + op_info->session_id); return -VIRTIO_CRYPTO_INVSESS; } diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c index 45aba1ff67b895a9c22658e7fd9577cfa5bb6264..45b287a953542936b7b81fa0b91d80e0b4fc03cc 100644 --- a/backends/cryptodev-lkcf.c +++ b/backends/cryptodev-lkcf.c @@ -330,6 +330,8 @@ static void cryptodev_lkcf_execute_task(CryptoDevLKCFTask *task) 
cryptodev_lkcf_set_op_desc(&session->akcipher_opts, op_desc, sizeof(op_desc), &local_error) != 0) { error_report_err(local_error); + status = -VIRTIO_CRYPTO_ERR; + goto out; } else { key_id = add_key(KCTL_KEY_TYPE_PKEY, "lkcf-backend-priv-key", p8info, p8info_len, KCTL_KEY_RING); @@ -346,6 +348,7 @@ static void cryptodev_lkcf_execute_task(CryptoDevLKCFTask *task) session->key, session->keylen, &local_error); if (!akcipher) { + error_report_err(local_error); status = -VIRTIO_CRYPTO_ERR; goto out; } diff --git a/backends/cryptodev-vhost-user.c b/backends/cryptodev-vhost-user.c index c3283ba84a55e53e3f57d22093c011c588d1d85b..b8e95ca8b42c53f5353ffa0b237dc8fbf57011aa 100644 --- a/backends/cryptodev-vhost-user.c +++ b/backends/cryptodev-vhost-user.c @@ -281,8 +281,7 @@ static int cryptodev_vhost_user_create_session( break; default: - error_setg(&local_error, "Unsupported opcode :%" PRIu32 "", - sess_info->op_code); + error_report("Unsupported opcode :%" PRIu32 "", sess_info->op_code); return -VIRTIO_CRYPTO_NOTSUPP; } diff --git a/backends/cryptodev.c b/backends/cryptodev.c index e5006bd215c8172c4bdba7e4aaa93361a7e04124..fff89fd62ae1354458bea517a000cca615068b74 100644 --- a/backends/cryptodev.c +++ b/backends/cryptodev.c @@ -398,6 +398,7 @@ static void cryptodev_backend_set_ops(Object *obj, Visitor *v, static void cryptodev_backend_complete(UserCreatable *uc, Error **errp) { + ERRP_GUARD(); CryptoDevBackend *backend = CRYPTODEV_BACKEND(uc); CryptoDevBackendClass *bc = CRYPTODEV_BACKEND_GET_CLASS(uc); uint32_t services; @@ -406,11 +407,20 @@ cryptodev_backend_complete(UserCreatable *uc, Error **errp) QTAILQ_INIT(&backend->opinfos); value = backend->tc.buckets[THROTTLE_OPS_TOTAL].avg; cryptodev_backend_set_throttle(backend, THROTTLE_OPS_TOTAL, value, errp); + if (*errp) { + return; + } value = backend->tc.buckets[THROTTLE_BPS_TOTAL].avg; cryptodev_backend_set_throttle(backend, THROTTLE_BPS_TOTAL, value, errp); + if (*errp) { + return; + } if (bc->init) { bc->init(backend, errp); + if (*errp) { + return; + } } services = backend->conf.crypto_services; diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c new file mode 100644 index 0000000000000000000000000000000000000000..8f2dda1beb9bbea27a61c17d439aeb19ec26cc90 --- /dev/null +++ b/backends/host_iommu_device.c @@ -0,0 +1,33 @@ +/* + * Host IOMMU device abstract + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
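The new file continuing below defines HostIOMMUDevice as an abstract QOM type; its finalize hook only frees hiod->name, and concrete backends are expected to subclass it. As a hedged sketch, not part of the patch, of how a concrete type would hang off that abstract parent (the "mock" names are hypothetical, and it assumes sysemu/host_iommu_device.h exports TYPE_HOST_IOMMU_DEVICE):

```c
/* Hedged sketch: registering a concrete subclass of the abstract type. */
#include "qemu/osdep.h"
#include "qom/object.h"
#include "sysemu/host_iommu_device.h"

#define TYPE_MOCK_HOST_IOMMU_DEVICE "mock-host-iommu-device"
OBJECT_DECLARE_SIMPLE_TYPE(MockHostIOMMUDevice, MOCK_HOST_IOMMU_DEVICE)

struct MockHostIOMMUDevice {
    HostIOMMUDevice parent_obj;     /* the abstract parent defined below */
};

static const TypeInfo mock_host_iommu_device_info = {
    .name          = TYPE_MOCK_HOST_IOMMU_DEVICE,
    .parent        = TYPE_HOST_IOMMU_DEVICE,
    .instance_size = sizeof(MockHostIOMMUDevice),
};

static void mock_register_types(void)
{
    type_register_static(&mock_host_iommu_device_info);
}
type_init(mock_register_types)
```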
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/host_iommu_device.h"
+
+OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice,
+                            host_iommu_device,
+                            HOST_IOMMU_DEVICE,
+                            OBJECT)
+
+static void host_iommu_device_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void host_iommu_device_init(Object *obj)
+{
+}
+
+static void host_iommu_device_finalize(Object *obj)
+{
+    HostIOMMUDevice *hiod = HOST_IOMMU_DEVICE(obj);
+
+    g_free(hiod->name);
+}
diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index 361d4a8103ef82cba0994b24bc989a65798741ef..ce63a372a3fce4733c1adb02f7940766ba1ba2d5 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -20,9 +20,14 @@
 #include "qom/object.h"
 #include "qapi/visitor.h"
 #include "qapi/qapi-visit-common.h"
+#include "sysemu/kvm.h"
+#include "exec/address-spaces.h"
 
 OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendFile, MEMORY_BACKEND_FILE)
 
+bool virtcca_shared_hugepage_mapped = false;
+uint64_t virtcca_cvm_ram_size = 0;
+uint64_t virtcca_cvm_gpa_start = 0;
 
 struct HostMemoryBackendFile {
     HostMemoryBackend parent_obj;
@@ -36,6 +41,91 @@ struct HostMemoryBackendFile {
     OnOffAuto rom;
 };
 
+/* Parse the path of the hugepages memory file used for memory sharing */
+static int virtcca_parse_share_mem_path(char *src, char *dst)
+{
+    int ret = 0;
+    char src_copy[PATH_MAX];
+    char *token = NULL;
+    char *last_dir = NULL;
+    char *second_last_dir = NULL;
+    static const char delimiter[] = "/";
+
+    if (src == NULL || dst == NULL ||
+        strlen(src) == 0 || strlen(src) > PATH_MAX - 1) {
+        error_report("Invalid input: NULL pointer or invalid string length.");
+        return -1;
+    }
+
+    strcpy(src_copy, src);
+    token = strtok(src_copy, delimiter);
+
+    /* Iterate over the path segments to find the second-to-last directory */
+    while (token != NULL) {
+        second_last_dir = last_dir;
+        last_dir = token;
+        token = strtok(NULL, delimiter);
+    }
+
+    /* Check if the second-to-last directory is found */
+    if (second_last_dir == NULL) {
+        error_report("Invalid path: second-to-last directory not found.");
+        return -1;
+    }
+
+    /*
+     * Construct the share memory path by appending the extracted domain name
+     * to the hugepages memory filesystem prefix
+     */
+    ret = snprintf(dst, PATH_MAX, "/dev/hugepages/libvirt/qemu/%s",
+                   second_last_dir);
+
+    if (ret < 0 || ret >= PATH_MAX) {
+        error_report("Error: snprintf failed to construct the share mem path");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Create a hugepage memory region in the virtcca scenario
+ * for sharing with processes like vhost-user and others.
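virtcca_parse_share_mem_path() above walks the "/"-separated segments with strtok() and keeps the last two, so the second-to-last segment (the libvirt domain directory) is what survives the walk. A standalone sketch of the same technique; the sample path and main() are illustrative only:

```c
/* Standalone demo of keeping the second-to-last path component. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char path[] = "/dev/hugepages/libvirt/qemu/5-guest01/ram"; /* sample */
    char *last = NULL, *second_last = NULL;

    /* strtok() writes into path[], so the buffer must be mutable */
    for (char *tok = strtok(path, "/"); tok; tok = strtok(NULL, "/")) {
        second_last = last;
        last = tok;
    }

    /* prints "5-guest01" for the sample path */
    printf("domain: %s\n", second_last ? second_last : "(none)");
    return 0;
}
```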
+ */ +static void +virtcca_shared_backend_memory_alloc(char *mem_path, uint32_t ram_flags, Error **errp) +{ + char dst[PATH_MAX]; + uint64_t size = virtcca_cvm_ram_size; + + if (virtcca_parse_share_mem_path(mem_path, dst)) { + error_report("parse virtcca share memory path failed"); + exit(1); + } + + /* + * 1) CVM_GPA_START = 3GB --> fix size = 1GB + * 2) CVM_GPA_START = 1GB && ram_size >= 3GB --> size = 3GB + * 3) CVM_GPA_START = 1GB && ram_size < 3GB --> size = ram_size + */ + if (virtcca_cvm_gpa_start != DEFAULT_VM_GPA_START) { + size = VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - virtcca_cvm_gpa_start; + } else if (virtcca_cvm_ram_size >= VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - DEFAULT_VM_GPA_START) { + size = VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT - DEFAULT_VM_GPA_START; + } + + virtcca_shared_hugepage = g_new(MemoryRegion, 1); + memory_region_init_ram_from_file(virtcca_shared_hugepage, NULL, + "virtcca_shared_hugepage", size, + VIRTCCA_SHARED_HUGEPAGE_ALIGN, + ram_flags, dst, 0, errp); + if (*errp) { + error_reportf_err(*errp, "cannot init RamBlock for virtcca_shared_hugepage: "); + exit(1); + } + virtcca_shared_hugepage_mapped = true; +} + static void file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) { @@ -90,6 +180,12 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) backend->size, fb->align, ram_flags, fb->mem_path, fb->offset, errp); g_free(name); + + if (virtcca_cvm_enabled() && backend->share && + (strcmp(fb->mem_path, "/dev/shm") != 0) && + !virtcca_shared_hugepage_mapped) { + virtcca_shared_backend_memory_alloc(fb->mem_path, ram_flags, errp); + } #endif } diff --git a/backends/iommufd.c b/backends/iommufd.c new file mode 100644 index 0000000000000000000000000000000000000000..4446efaa32a2644ffc49fbd1632d1ef8dd7ed3d4 --- /dev/null +++ b/backends/iommufd.c @@ -0,0 +1,532 @@ +/* + * iommufd container backend + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 
2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "sysemu/iommufd.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qemu/module.h" +#include "qom/object_interfaces.h" +#include "qemu/error-report.h" +#include "monitor/monitor.h" +#include "trace.h" +#include "hw/vfio/vfio-common.h" +#include +#include + +static void iommufd_backend_init(Object *obj) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + + be->fd = -1; + be->users = 0; + be->owned = true; +} + +static void iommufd_backend_finalize(Object *obj) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + + if (be->owned) { + close(be->fd); + be->fd = -1; + } +} + +static void iommufd_backend_set_fd(Object *obj, const char *str, Error **errp) +{ + ERRP_GUARD(); + IOMMUFDBackend *be = IOMMUFD_BACKEND(obj); + int fd = -1; + + fd = monitor_fd_param(monitor_cur(), str, errp); + if (fd == -1) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + be->fd = fd; + be->owned = false; + trace_iommu_backend_set_fd(be->fd); +} + +static bool iommufd_backend_can_be_deleted(UserCreatable *uc) +{ + IOMMUFDBackend *be = IOMMUFD_BACKEND(uc); + + return !be->users; +} + +static void iommufd_backend_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->can_be_deleted = iommufd_backend_can_be_deleted; + + object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd); +} + +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp) +{ + int fd, ret = 0; + + if (be->owned && !be->users) { + fd = qemu_open("/dev/iommu", O_RDWR, errp); + if (fd < 0) { + ret = fd; + goto out; + } + be->fd = fd; + } + be->users++; +out: + trace_iommufd_backend_connect(be->fd, be->owned, + be->users, ret); + return ret; +} + +void iommufd_backend_disconnect(IOMMUFDBackend *be) +{ + if (!be->users) { + goto out; + } + be->users--; + if (!be->users && be->owned) { + close(be->fd); + be->fd = -1; + } +out: + trace_iommufd_backend_disconnect(be->fd, be->users); +} + +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp) +{ + int ret, fd = be->fd; + struct iommu_ioas_alloc alloc_data = { + .size = sizeof(alloc_data), + .flags = 0, + }; + + ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data); + if (ret) { + error_setg_errno(errp, errno, "Failed to allocate ioas"); + return ret; + } + + *ioas_id = alloc_data.out_ioas_id; + trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret); + + return ret; +} + +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id) +{ + int ret, fd = be->fd; + struct iommu_destroy des = { + .size = sizeof(des), + .id = id, + }; + + ret = ioctl(fd, IOMMU_DESTROY, &des); + trace_iommufd_backend_free_id(fd, id, ret); + if (ret) { + error_report("Failed to free id: %u %m", id); + } +} + +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + int ret, fd = be->fd; + struct iommu_ioas_map map = { + .size = sizeof(map), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .ioas_id = ioas_id, + .__reserved = 0, + .user_va = (uintptr_t)vaddr, + .iova = iova, + .length = size, + }; + + if (!readonly) { + map.flags |= IOMMU_IOAS_MAP_WRITEABLE; + } + + ret = ioctl(fd, IOMMU_IOAS_MAP, &map); + trace_iommufd_backend_map_dma(fd, ioas_id, iova, size, + vaddr, readonly, ret); + if (ret) { + ret = -errno; + + /* TODO: Not support mapping hardware PCI BAR region for now. 
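The wrappers in this new backend sit directly on the /dev/iommu character device. A hedged, self-contained sketch of the raw uapi sequence they wrap (open the device, allocate an IOAS, map one anonymous page at a fixed IOVA); it assumes a kernel with iommufd enabled and trims most error handling:

```c
/* Hedged sketch, not from the patch: the raw iommufd uapi sequence. */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/iommufd.h>

int main(void)
{
    int fd = open("/dev/iommu", O_RDWR); /* what iommufd_backend_connect() opens */
    if (fd < 0) {
        perror("open /dev/iommu");
        return 1;
    }

    struct iommu_ioas_alloc alloc = { .size = sizeof(alloc) };
    if (ioctl(fd, IOMMU_IOAS_ALLOC, &alloc)) { /* iommufd_backend_alloc_ioas() */
        perror("IOMMU_IOAS_ALLOC");
        return 1;
    }

    void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    struct iommu_ioas_map map = {   /* what iommufd_backend_map_dma() fills */
        .size = sizeof(map),
        .flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE |
                 IOMMU_IOAS_MAP_FIXED_IOVA,
        .ioas_id = alloc.out_ioas_id,
        .user_va = (uintptr_t)buf,
        .iova = 0x10000,
        .length = 4096,
    };
    if (ioctl(fd, IOMMU_IOAS_MAP, &map)) {
        perror("IOMMU_IOAS_MAP");
    }

    close(fd);
    return 0;
}
```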
*/
+        if (errno == EFAULT) {
+            warn_report("IOMMU_IOAS_MAP failed: %m, PCI BAR?");
+        } else {
+            error_report("IOMMU_IOAS_MAP failed: %m");
+        }
+    }
+    return ret;
+}
+
+int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
+                              hwaddr iova, ram_addr_t size)
+{
+    int ret, fd = be->fd;
+    struct iommu_ioas_unmap unmap = {
+        .size = sizeof(unmap),
+        .ioas_id = ioas_id,
+        .iova = iova,
+        .length = size,
+    };
+
+    ret = ioctl(fd, IOMMU_IOAS_UNMAP, &unmap);
+    /*
+     * IOMMUFD takes a mapping as a kind of object; unmapping a
+     * nonexistent mapping is treated as deleting a nonexistent
+     * object and returns ENOENT. This is different from the legacy
+     * backend, which allows it. A vIOMMU may trigger a lot of
+     * redundant unmapping; to avoid flooding the log, treat it
+     * as success for IOMMUFD, just like the legacy backend.
+     */
+    if (ret && errno == ENOENT) {
+        trace_iommufd_backend_unmap_dma_non_exist(fd, ioas_id, iova, size, ret);
+        ret = 0;
+    } else {
+        trace_iommufd_backend_unmap_dma(fd, ioas_id, iova, size, ret);
+    }
+
+    if (ret) {
+        ret = -errno;
+        error_report("IOMMU_IOAS_UNMAP failed: %m");
+    }
+    return ret;
+}
+
+bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id,
+                                uint32_t pt_id, uint32_t flags,
+                                uint32_t data_type, uint32_t data_len,
+                                void *data_ptr, uint32_t *out_hwpt,
+                                uint32_t *out_fault_fd, Error **errp)
+{
+    int ret, fd = be->fd;
+    struct iommu_hwpt_alloc alloc_hwpt = {
+        .size = sizeof(struct iommu_hwpt_alloc),
+        .flags = flags,
+        .dev_id = dev_id,
+        .pt_id = pt_id,
+        .data_type = data_type,
+        .data_len = data_len,
+        .data_uptr = (uintptr_t)data_ptr,
+    };
+
+    if (flags & IOMMU_HWPT_FAULT_ID_VALID) {
+
+        struct iommu_fault_alloc cmd = {
+            .size = sizeof(cmd),
+        };
+
+        ret = ioctl(fd, IOMMU_FAULT_QUEUE_ALLOC, &cmd);
+        if (ret) {
+            ret = -errno;
+            error_report("IOMMU_FAULT_ALLOC failed: %m");
+        } else {
+            alloc_hwpt.fault_id = cmd.out_fault_id;
+            if (out_fault_fd) {
+                *out_fault_fd = cmd.out_fault_fd;
+            }
+        }
+    }
+
+    ret = ioctl(fd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
+    trace_iommufd_backend_alloc_hwpt(fd, dev_id, pt_id, flags, data_type,
+                                     data_len, (uintptr_t)data_ptr,
+                                     alloc_hwpt.out_hwpt_id, ret);
+    if (ret) {
+        error_setg_errno(errp, errno, "Failed to allocate hwpt");
+        return false;
+    }
+
+    *out_hwpt = alloc_hwpt.out_hwpt_id;
+    return true;
+}
+
+bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be,
+                                        uint32_t hwpt_id, bool start,
+                                        Error **errp)
+{
+    int ret;
+    struct iommu_hwpt_set_dirty_tracking set_dirty = {
+        .size = sizeof(set_dirty),
+        .hwpt_id = hwpt_id,
+        .flags = start ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0,
+    };
+
+    ret = ioctl(be->fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &set_dirty);
+    trace_iommufd_backend_set_dirty(be->fd, hwpt_id, start, ret ? errno : 0);
+    if (ret) {
+        error_setg_errno(errp, errno,
+                         "IOMMU_HWPT_SET_DIRTY_TRACKING(hwpt_id %u) failed",
+                         hwpt_id);
+        return false;
+    }
+
+    return true;
+}
+
+bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be,
+                                      uint32_t hwpt_id,
+                                      uint64_t iova, ram_addr_t size,
+                                      uint64_t page_size, uint64_t *data,
+                                      Error **errp)
+{
+    int ret;
+    struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap = {
+        .size = sizeof(get_dirty_bitmap),
+        .hwpt_id = hwpt_id,
+        .iova = iova,
+        .length = size,
+        .page_size = page_size,
+        .data = (uintptr_t)data,
+    };
+
+    ret = ioctl(be->fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &get_dirty_bitmap);
+    trace_iommufd_backend_get_dirty_bitmap(be->fd, hwpt_id, iova, size,
+                                           page_size, ret ?
errno : 0); + if (ret) { + error_setg_errno(errp, errno, + "IOMMU_HWPT_GET_DIRTY_BITMAP (iova: 0x%"HWADDR_PRIx + " size: 0x"RAM_ADDR_FMT") failed", iova, size); + return false; + } + + return true; +} + +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, uint8_t *max_pasid_log2, + Error **errp) +{ + struct iommu_hw_info info = { + .size = sizeof(info), + .dev_id = devid, + .data_len = len, + .data_uptr = (uintptr_t)data, + }; + + if (ioctl(be->fd, IOMMU_GET_HW_INFO, &info)) { + error_setg_errno(errp, errno, "Failed to get hardware info"); + return false; + } + + g_assert(type); + *type = info.out_data_type; + g_assert(caps); + *caps = info.out_capabilities; + *max_pasid_log2 = info.out_max_pasid_log2; + + return true; +} + +int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr) +{ + int ret, fd = be->fd; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = hwpt_id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = *entry_num, + .data_uptr = (uintptr_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + + trace_iommufd_backend_invalidate_cache(fd, hwpt_id, data_type, entry_len, + *entry_num, cache.entry_num, + (uintptr_t)data_ptr, ret); + if (ret) { + *entry_num = cache.entry_num; + error_report("IOMMU_HWPT_INVALIDATE failed: %s", strerror(errno)); + ret = -errno; + } else { + g_assert(*entry_num == cache.entry_num); + } + + return ret; +} + +struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id) +{ + int ret, fd = be->fd; + struct IOMMUFDViommu *viommu = g_malloc(sizeof(*viommu)); + struct iommu_viommu_alloc alloc_viommu = { + .size = sizeof(alloc_viommu), + .type = viommu_type, + .dev_id = dev_id, + .hwpt_id = hwpt_id, + }; + + if (!viommu) { + error_report("failed to allocate viommu object"); + return NULL; + } + + ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &alloc_viommu); + + trace_iommufd_backend_alloc_viommu(fd, viommu_type, dev_id, hwpt_id, + alloc_viommu.out_viommu_id, ret); + if (ret) { + error_report("IOMMU_VIOMMU_ALLOC failed: %s", strerror(errno)); + g_free(viommu); + return NULL; + } + + viommu->viommu_id = alloc_viommu.out_viommu_id; + viommu->s2_hwpt_id = hwpt_id; + viommu->iommufd = be; + return viommu; +} + +struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + IOMMUFDViommu *viommu, + uint64_t virt_id) +{ + int ret, fd = viommu->iommufd->fd; + struct IOMMUFDVdev *vdev = g_malloc(sizeof(*vdev)); + struct iommu_vdevice_alloc alloc_vdev = { + .size = sizeof(alloc_vdev), + .viommu_id = viommu->viommu_id, + .dev_id = idev->devid, + .virt_id = virt_id, + }; + + ret = ioctl(fd, IOMMU_VDEVICE_ALLOC, &alloc_vdev); + + trace_iommufd_backend_alloc_vdev(fd, idev->devid, viommu->viommu_id, virt_id, + alloc_vdev.out_vdevice_id, ret); + + if (ret) { + error_report("IOMMU_VDEVICE_ALLOC failed: %s", strerror(errno)); + g_free(vdev); + return NULL; + } + + vdev->idev = idev; + vdev->viommu = viommu; + vdev->virt_id = virt_id; + vdev->vdev_id = alloc_vdev.out_vdevice_id; + return vdev; +} + +int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr) +{ + int ret, fd = be->fd; + struct iommu_hwpt_invalidate cache = { + .size = sizeof(cache), + .hwpt_id = 
viommu_id, + .data_type = data_type, + .entry_len = entry_len, + .entry_num = *entry_num, + .data_uptr = (uint64_t)data_ptr, + }; + + ret = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cache); + + trace_iommufd_viommu_invalidate_cache(fd, viommu_id, data_type, + entry_len, *entry_num, + cache.entry_num, + (uint64_t)data_ptr, ret); + if (ret) { + *entry_num = cache.entry_num; + error_report("IOMMU_VIOMMU_INVALIDATE failed: %s", strerror(errno)); + ret = -errno; + } else { + g_assert(*entry_num == cache.entry_num); + } + + return ret; +} + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->attach_hwpt); + return idevc->attach_hwpt(idev, hwpt_id, errp); +} + +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + HostIOMMUDeviceIOMMUFDClass *idevc = + HOST_IOMMU_DEVICE_IOMMUFD_GET_CLASS(idev); + + g_assert(idevc->detach_hwpt); + return idevc->detach_hwpt(idev, errp); +} + +static int hiod_iommufd_get_cap(HostIOMMUDevice *hiod, int cap, Error **errp) +{ + HostIOMMUDeviceCaps *caps = &hiod->caps; + + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE: + return caps->type; + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return vfio_device_get_aw_bits(hiod->agent); + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + +static void hiod_iommufd_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->get_cap = hiod_iommufd_get_cap; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_IOMMUFD_BACKEND, + .parent = TYPE_OBJECT, + .instance_size = sizeof(IOMMUFDBackend), + .instance_init = iommufd_backend_init, + .instance_finalize = iommufd_backend_finalize, + .class_size = sizeof(IOMMUFDBackendClass), + .class_init = iommufd_backend_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .parent = TYPE_HOST_IOMMU_DEVICE, + .instance_size = sizeof(HostIOMMUDeviceIOMMUFD), + .class_size = sizeof(HostIOMMUDeviceIOMMUFDClass), + .class_init = hiod_iommufd_class_init, + .abstract = true, + } +}; +DEFINE_TYPES(types) diff --git a/backends/meson.build b/backends/meson.build index 914c7c4afb905cfe710ad23dd1ee42907f6d1679..68b5e34e04959e52472fdf6dba7a703540e4fa4c 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -13,6 +13,7 @@ system_ss.add([files( system_ss.add(when: 'CONFIG_POSIX', if_true: files('rng-random.c')) system_ss.add(when: 'CONFIG_POSIX', if_true: files('hostmem-file.c')) system_ss.add(when: 'CONFIG_LINUX', if_true: files('hostmem-memfd.c')) +system_ss.add(when: 'CONFIG_LINUX', if_true: files('host_iommu_device.c')) if keyutils.found() system_ss.add(keyutils, files('cryptodev-lkcf.c')) endif @@ -20,6 +21,7 @@ if have_vhost_user system_ss.add(when: 'CONFIG_VIRTIO', if_true: files('vhost-user.c')) endif system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c')) +system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c')) if have_vhost_user_crypto system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost-user.c')) endif diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c index f7f1b4ad7a80588c747883fe212d1207687728ed..0d07df216e47a3ced2a35b030b5c0911394608fc 100644 --- a/backends/tpm/tpm_emulator.c +++ b/backends/tpm/tpm_emulator.c @@ -128,10 
+128,10 @@ static int tpm_emulator_ctrlcmd(TPMEmulator *tpm, unsigned long cmd, void *msg, CharBackend *dev = &tpm->ctrl_chr; uint32_t cmd_no = cpu_to_be32(cmd); ssize_t n = sizeof(uint32_t) + msg_len_in; - uint8_t *buf = NULL; WITH_QEMU_LOCK_GUARD(&tpm->mutex) { - buf = g_alloca(n); + g_autofree uint8_t *buf = g_malloc(n); + memcpy(buf, &cmd_no, sizeof(cmd_no)); memcpy(buf + sizeof(cmd_no), msg, msg_len_in); diff --git a/backends/trace-events b/backends/trace-events index 652eb76a5723e2053fe97338c481309c58284d6a..f8592a27111234a6a59cf7fc0fcc253268876fb1 100644 --- a/backends/trace-events +++ b/backends/trace-events @@ -5,3 +5,20 @@ dbus_vmstate_pre_save(void) dbus_vmstate_post_load(int version_id) "version_id: %d" dbus_vmstate_loading(const char *id) "id: %s" dbus_vmstate_saving(const char *id) "id: %s" + +# iommufd.c +iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d owned=%d users=%d (%d)" +iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d" +iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d" +iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)" +iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" +iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)" +iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d ioas=%d (%d)" +iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)" +iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d (%d)" +iommufd_backend_set_dirty(int iommufd, uint32_t hwpt_id, bool start, int ret) " iommufd=%d hwpt=%u enable=%d (%d)" +iommufd_backend_get_dirty_bitmap(int iommufd, uint32_t hwpt_id, uint64_t iova, uint64_t size, uint64_t page_size, int ret) " iommufd=%d hwpt=%u iova=0x%"PRIx64" size=0x%"PRIx64" page_size=0x%"PRIx64" (%d)" +iommufd_backend_invalidate_cache(int iommufd, uint32_t hwpt_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d hwpt_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" +iommufd_backend_alloc_viommu(int iommufd, uint32_t type, uint32_t dev_id, uint32_t hwpt_id, uint32_t viommu_id, int ret) " iommufd=%d type=%u dev_id=%u hwpt_id=%u viommu_id=%u (%d)" +iommufd_backend_alloc_vdev(int iommufd, uint32_t dev_id, uint32_t viommu_id, uint64_t virt_id, uint32_t vdev_id, int ret) " iommufd=%d dev_id=%u viommu_id=%u virt_id=0x%"PRIx64" vdev_id=%u (%d)" +iommufd_viommu_invalidate_cache(int iommufd, uint32_t viommu_id, uint32_t data_type, uint32_t entry_len, uint32_t entry_num, uint32_t done_num, uint64_t data_ptr, int ret) " iommufd=%d viommu_id=%u data_type=%u entry_len=%u entry_num=%u done_num=%u data_ptr=0x%"PRIx64" (%d)" diff --git a/block.c b/block.c index bfb0861ec61d2965e1ebd1d3d05976ab25a76c7f..6a2abfabcbc0e0cc4daacf7429e0e9129325eef2 100644 --- a/block.c +++ b/block.c @@ -68,6 +68,9 @@ #define NOT_DONE 0x7fffffff /* used while emulated sync operation 
in progress */ +#define DEFAULT_BIOS_BOOT_LOADER_DIR "/usr/share/edk2" +#define DEFAULT_NVRAM_TEMPLATE_DIR "/var/lib/libvirt/qemu/nvram" + /* Protected by BQL */ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = QTAILQ_HEAD_INITIALIZER(graph_bdrv_states); @@ -86,6 +89,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename, BlockDriverState *parent, const BdrvChildClass *child_class, BdrvChildRole child_role, + bool parse_filename, Error **errp); static bool bdrv_recurse_has_child(BlockDriverState *bs, @@ -2047,7 +2051,8 @@ static void parse_json_protocol(QDict *options, const char **pfilename, * block driver has been specified explicitly. */ static int bdrv_fill_options(QDict **options, const char *filename, - int *flags, Error **errp) + int *flags, bool allow_parse_filename, + Error **errp) { const char *drvname; bool protocol = *flags & BDRV_O_PROTOCOL; @@ -2089,7 +2094,7 @@ static int bdrv_fill_options(QDict **options, const char *filename, if (protocol && filename) { if (!qdict_haskey(*options, "filename")) { qdict_put_str(*options, "filename", filename); - parse_filename = true; + parse_filename = allow_parse_filename; } else { error_setg(errp, "Can't specify 'file' and 'filename' options at " "the same time"); @@ -3675,7 +3680,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, } backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs, - &child_of_bds, bdrv_backing_role(bs), errp); + &child_of_bds, bdrv_backing_role(bs), true, + errp); if (!backing_hd) { bs->open_flags |= BDRV_O_NO_BACKING; error_prepend(errp, "Could not open backing file: "); @@ -3712,7 +3718,8 @@ free_exit: static BlockDriverState * bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key, BlockDriverState *parent, const BdrvChildClass *child_class, - BdrvChildRole child_role, bool allow_none, Error **errp) + BdrvChildRole child_role, bool allow_none, + bool parse_filename, Error **errp) { BlockDriverState *bs = NULL; QDict *image_options; @@ -3743,7 +3750,8 @@ bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key, } bs = bdrv_open_inherit(filename, reference, image_options, 0, - parent, child_class, child_role, errp); + parent, child_class, child_role, parse_filename, + errp); if (!bs) { goto done; } @@ -3753,6 +3761,37 @@ done: return bs; } +static BdrvChild *bdrv_open_child_common(const char *filename, + QDict *options, const char *bdref_key, + BlockDriverState *parent, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + bool allow_none, bool parse_filename, + Error **errp) +{ + BlockDriverState *bs; + BdrvChild *child; + AioContext *ctx; + + GLOBAL_STATE_CODE(); + + bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class, + child_role, allow_none, parse_filename, errp); + if (bs == NULL) { + return NULL; + } + + bdrv_graph_wrlock(NULL); + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); + child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, + errp); + aio_context_release(ctx); + bdrv_graph_wrunlock(NULL); + + return child; +} + /* * Opens a disk image whose options are given as BlockdevRef in another block * device's options. 
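A BlockdevRef in a parent's options can be either a reference to an existing node by name or an inline dictionary of child options; bdrv_open_child_bs() accepts both forms. A hedged sketch of the two shapes, assuming QEMU's QDict API (the node name and path are made up):

```c
/* Hedged sketch: two ways a parent's options can carry the "file" child. */
#include "qemu/osdep.h"
#include "qapi/qmp/qdict.h"

static QDict *make_parent_options(bool by_reference)
{
    QDict *opts = qdict_new();

    if (by_reference) {
        /* "file" names an existing node; no filename parsing happens */
        qdict_put_str(opts, "file", "my-protocol-node");
    } else {
        /* inline child options under the flattened "file." prefix */
        qdict_put_str(opts, "file.driver", "file");
        qdict_put_str(opts, "file.filename", "/var/lib/images/disk.raw");
    }
    return opts;
}
```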
@@ -3778,31 +3817,15 @@ BdrvChild *bdrv_open_child(const char *filename, BdrvChildRole child_role, bool allow_none, Error **errp) { - BlockDriverState *bs; - BdrvChild *child; - AioContext *ctx; - - GLOBAL_STATE_CODE(); - - bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class, - child_role, allow_none, errp); - if (bs == NULL) { - return NULL; - } - - bdrv_graph_wrlock(NULL); - ctx = bdrv_get_aio_context(bs); - aio_context_acquire(ctx); - child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role, - errp); - aio_context_release(ctx); - bdrv_graph_wrunlock(NULL); - - return child; + return bdrv_open_child_common(filename, options, bdref_key, parent, + child_class, child_role, allow_none, false, + errp); } /* - * Wrapper on bdrv_open_child() for most popular case: open primary child of bs. + * This does mostly the same as bdrv_open_child(), but for opening the primary + * child of a node. A notable difference from bdrv_open_child() is that it + * enables filename parsing for protocol names (including json:). * * The caller must hold the lock of the main AioContext and no other AioContext. * @parent can move to a different AioContext in this function. Callers must @@ -3819,8 +3842,8 @@ int bdrv_open_file_child(const char *filename, role = parent->drv->is_filter ? (BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY) : BDRV_CHILD_IMAGE; - if (!bdrv_open_child(filename, options, bdref_key, parent, - &child_of_bds, role, false, errp)) + if (!bdrv_open_child_common(filename, options, bdref_key, parent, + &child_of_bds, role, false, true, errp)) { return -EINVAL; } @@ -3865,7 +3888,8 @@ BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp) } - bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp); + bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, false, + errp); obj = NULL; qobject_unref(obj); visit_free(v); @@ -3962,7 +3986,7 @@ static BlockDriverState * no_coroutine_fn bdrv_open_inherit(const char *filename, const char *reference, QDict *options, int flags, BlockDriverState *parent, const BdrvChildClass *child_class, BdrvChildRole child_role, - Error **errp) + bool parse_filename, Error **errp) { int ret; BlockBackend *file = NULL; @@ -4011,9 +4035,11 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, } /* json: syntax counts as explicit options, as if in the QDict */ - parse_json_protocol(options, &filename, &local_err); - if (local_err) { - goto fail; + if (parse_filename) { + parse_json_protocol(options, &filename, &local_err); + if (local_err) { + goto fail; + } } bs->explicit_options = qdict_clone_shallow(options); @@ -4038,7 +4064,8 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, parent->open_flags, parent->options); } - ret = bdrv_fill_options(&options, filename, &flags, &local_err); + ret = bdrv_fill_options(&options, filename, &flags, parse_filename, + &local_err); if (ret < 0) { goto fail; } @@ -4107,7 +4134,7 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options, file_bs = bdrv_open_child_bs(filename, options, "file", bs, &child_of_bds, BDRV_CHILD_IMAGE, - true, &local_err); + true, true, &local_err); if (local_err) { goto fail; } @@ -4270,7 +4297,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference, GLOBAL_STATE_CODE(); return bdrv_open_inherit(filename, reference, options, flags, NULL, - NULL, 0, errp); + NULL, 0, true, errp); } /* Return true if the NULL-terminated @list contains @str */ @@ 
-7017,7 +7044,13 @@ int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
     assert(!(bs->open_flags & BDRV_O_INACTIVE));
     assert_bdrv_graph_readable();
 
-    if (bs->drv->bdrv_co_invalidate_cache) {
+    /*
+     * The BIOS bootloader and the NVRAM template do not need to drop their
+     * cache on migration; skip this step for them to avoid increasing
+     * downtime.
+     */
+    if (bs->drv->bdrv_co_invalidate_cache &&
+        !strstr(bs->filename, DEFAULT_BIOS_BOOT_LOADER_DIR) &&
+        !strstr(bs->filename, DEFAULT_NVRAM_TEMPLATE_DIR)) {
         bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
@@ -7298,6 +7331,22 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
                    bdrv_get_device_or_node_name(bs));
         return true;
     }
+
+    /*
+     * When migration puts a BDRV_O_INACTIVE flag on driver's open_flags,
+     * we fake a blocker that doesn't exist. From now on, block jobs
+     * will not be permitted.
+     */
+    if ((op == BLOCK_OP_TYPE_RESIZE || op == BLOCK_OP_TYPE_COMMIT_SOURCE ||
+         op == BLOCK_OP_TYPE_MIRROR_SOURCE || op == BLOCK_OP_TYPE_MIRROR_TARGET) &&
+        (bs->open_flags & BDRV_O_INACTIVE)) {
+        if (errp) {
+            error_setg(errp, "block device is in use by migration with"
+                       " a driver BDRV_O_INACTIVE flag set");
+        }
+        return true;
+    }
+
     return false;
 }
diff --git a/block/blkio.c b/block/blkio.c
index 0a0a6c0f5fde6ae874982671dbe9842d2d8c70e0..52ac94527fb4c788424345d1823d2ad50f88fb6d 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -68,7 +68,7 @@ typedef struct {
     CoQueue bounce_available;
 
     /* The value of the "mem-region-alignment" property */
-    size_t mem_region_alignment;
+    uint64_t mem_region_alignment;
 
     /* Can we skip adding/deleting blkio_mem_regions? */
     bool needs_mem_regions;
@@ -89,6 +89,9 @@ static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
     /* Pad size to reduce frequency of resize calls */
     bytes += 128 * 1024;
 
+    /* Align the pool size to avoid blkio_alloc_mem_region() failure */
+    bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment);
+
     WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
         int ret;
 
@@ -896,8 +899,10 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
-    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
-                               BDRV_REQ_NO_FALLBACK;
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
+#ifdef CONFIG_BLKIO_WRITE_ZEROS_FUA
+    bs->supported_zero_flags |= BDRV_REQ_FUA;
+#endif
 
     qemu_mutex_init(&s->blkio_lock);
     qemu_co_mutex_init(&s->bounce_lock);
diff --git a/block/block-backend.c b/block/block-backend.c
index ec21148806982ae42c5a58526b31069f3c481a71..bfbbb18af1c3faf8b1282007089a70827fbe9068 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -92,6 +92,15 @@ struct BlockBackend {
      * Accessed with atomic ops.
      */
     unsigned int in_flight;
+
+    /* Timer for retry on errors. */
+    QEMUTimer *retry_timer;
+    /* Interval in ms to trigger next retry. */
+    int64_t retry_interval;
+    /* Start time of the first error. Used to check timeout. */
+    int64_t retry_start_time;
+    /* Retry timeout. 0 represents infinite retry.
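The retry fields added below feed blk_error_retry_timeout(): the first failure latches a start timestamp, later failures compare elapsed wall-clock time against the timeout, and a timeout of 0 means retry forever. A standalone sketch of that bookkeeping, with now_ms() as a hypothetical stand-in for qemu_clock_get_ms(QEMU_CLOCK_REALTIME):

```c
/* Standalone sketch of the retry-timeout bookkeeping. */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

static int64_t now_ms(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

static int64_t retry_start_time;     /* 0 until the first error occurs */
static int64_t retry_timeout = 5000; /* ms; 0 would mean retry forever */

static bool retry_timed_out(void)
{
    if (!retry_timeout) {
        return false;                /* infinite retries */
    }
    if (!retry_start_time) {
        retry_start_time = now_ms(); /* first error starts the clock */
        return false;
    }
    return now_ms() > retry_start_time + retry_timeout;
}
```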
*/ + int64_t retry_timeout; }; typedef struct BlockBackendAIOCB { @@ -368,6 +377,11 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm) blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT; blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC; + blk->retry_timer = NULL; + blk->retry_interval = BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL; + blk->retry_start_time = 0; + blk->retry_timeout = 0; + block_acct_init(&blk->stats); qemu_mutex_init(&blk->queued_requests_lock); @@ -508,6 +522,10 @@ static void blk_delete(BlockBackend *blk) QTAILQ_REMOVE(&block_backends, blk, link); drive_info_del(blk->legacy_dinfo); block_acct_cleanup(&blk->stats); + if (blk->retry_timer) { + timer_del(blk->retry_timer); + timer_free(blk->retry_timer); + } g_free(blk); } @@ -1102,6 +1120,14 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, blk->dev_ops = ops; blk->dev_opaque = opaque; + if ((blk->on_read_error == BLOCKDEV_ON_ERROR_RETRY || + blk->on_write_error == BLOCKDEV_ON_ERROR_RETRY) && + ops->retry_request_cb) { + blk->retry_timer = aio_timer_new(blk->ctx, QEMU_CLOCK_REALTIME, + SCALE_MS, ops->retry_request_cb, + opaque); + } + /* Are we currently quiesced? Should we enforce this right now? */ if (qatomic_read(&blk->quiesce_counter) && ops && ops->drained_begin) { ops->drained_begin(opaque); @@ -2120,6 +2146,39 @@ void blk_drain_all(void) bdrv_drain_all_end(); } +void blk_set_on_error_retry_interval(BlockBackend *blk, int64_t interval) +{ + blk->retry_interval = interval; +} + +void blk_set_on_error_retry_timeout(BlockBackend *blk, int64_t timeout) +{ + blk->retry_timeout = timeout; +} + +static bool blk_error_retry_timeout(BlockBackend *blk) +{ + /* No timeout set, infinite retries. */ + if (!blk->retry_timeout) { + return false; + } + + /* The first time an error occurs. */ + if (!blk->retry_start_time) { + blk->retry_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + return false; + } + + return qemu_clock_get_ms(QEMU_CLOCK_REALTIME) > (blk->retry_start_time + + blk->retry_timeout); +} + +void blk_error_retry_reset_timeout(BlockBackend *blk) +{ + if (blk->retry_timer && blk->retry_start_time) + blk->retry_start_time = 0; +} + void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, BlockdevOnError on_write_error) { @@ -2150,6 +2209,9 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, return BLOCK_ERROR_ACTION_REPORT; case BLOCKDEV_ON_ERROR_IGNORE: return BLOCK_ERROR_ACTION_IGNORE; + case BLOCKDEV_ON_ERROR_RETRY: + return (blk->retry_timer && !blk_error_retry_timeout(blk)) ? 
+ BLOCK_ERROR_ACTION_RETRY : BLOCK_ERROR_ACTION_REPORT; case BLOCKDEV_ON_ERROR_AUTO: default: abort(); @@ -2198,6 +2260,12 @@ void blk_error_action(BlockBackend *blk, BlockErrorAction action, qemu_system_vmstop_request_prepare(); send_qmp_error_event(blk, action, is_read, error); qemu_system_vmstop_request(RUN_STATE_IO_ERROR); + } else if (action == BLOCK_ERROR_ACTION_RETRY) { + if (!blk->quiesce_counter) { + timer_mod(blk->retry_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + + blk->retry_interval); + send_qmp_error_event(blk, action, is_read, error); + } } else { send_qmp_error_event(blk, action, is_read, error); } diff --git a/block/file-posix.c b/block/file-posix.c index b862406c719307d448908db87905cd4eef50ee34..787f613d5234c2a8a2f6569fc7bb98c64a42fa37 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -128,6 +128,10 @@ #define FTYPE_CD 1 #define MAX_BLOCKSIZE 4096 +#define DEFAULT_BUFFER_SIZE 65536 +#define BUFFER_ALIGN_SIZE 65536 +#define MIN_BUFFER_SIZE 65536 +#define MAX_BUFFER_SIZE 16777216 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, * leaving a few more bytes for its future use. */ @@ -158,6 +162,7 @@ typedef struct BDRVRawState { bool has_discard:1; bool has_write_zeroes:1; + bool discard_zeroes:1; bool use_linux_aio:1; bool use_linux_io_uring:1; int page_cache_inconsistent; /* errno from fdatasync failure */ @@ -202,6 +207,8 @@ typedef struct RawPosixAIOData { off_t aio_offset; uint64_t aio_nbytes; + size_t buffer_size; + union { struct { struct iovec *iov; @@ -765,6 +772,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } else { + s->discard_zeroes = true; s->has_fallocate = true; } } else { @@ -790,12 +798,19 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, #endif if (S_ISBLK(st.st_mode)) { +#ifdef BLKDISCARDZEROES + unsigned int arg; + if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { + s->discard_zeroes = true; + } +#endif #ifdef __linux__ /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do * not rely on the contents of discarded blocks unless using O_DIRECT. * Same for BLKZEROOUT. */ if (!(bs->open_flags & BDRV_O_NOCACHE)) { + s->discard_zeroes = false; s->has_write_zeroes = false; } #endif @@ -813,7 +828,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, #endif s->needs_alignment = raw_needs_alignment(bs); - bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; + bs->supported_zero_flags = s->discard_zeroes ? 
(BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) : 0; if (S_ISREG(st.st_mode)) { /* When extending regular files, we get zeros from the OS */ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; @@ -1408,7 +1423,7 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st, Error **errp) { BDRVRawState *s = bs->opaque; - BlockZoneModel zoned; + BlockZoneModel zoned = BLK_Z_NONE; int ret; ret = get_sysfs_zoned_model(st, &zoned); @@ -2621,7 +2636,8 @@ static void raw_close(BlockDriverState *bs) */ static int coroutine_fn raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, - PreallocMode prealloc, Error **errp) + PreallocMode prealloc, size_t buffer_size, + Error **errp) { RawPosixAIOData acb; @@ -2630,6 +2646,7 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset, .aio_fildes = fd, .aio_type = QEMU_AIO_TRUNCATE, .aio_offset = offset, + .buffer_size = buffer_size, .truncate = { .prealloc = prealloc, .errp = errp, @@ -2655,7 +2672,8 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset, if (S_ISREG(st.st_mode)) { /* Always resizes to the exact @offset */ - return raw_regular_truncate(bs, s->fd, offset, prealloc, errp); + return raw_regular_truncate(bs, s->fd, offset, prealloc, + DEFAULT_BUFFER_SIZE, errp); } if (prealloc != PREALLOC_MODE_OFF) { @@ -2873,6 +2891,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) int fd; uint64_t perm, shared; int result = 0; + int flags = O_RDWR | O_BINARY; + size_t buffer_size = DEFAULT_BUFFER_SIZE; /* Validate options and set default values */ assert(options->driver == BLOCKDEV_DRIVER_FILE); @@ -2892,9 +2912,19 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) error_setg(errp, "Extent size hint is too large"); goto out; } + if (!file_opts->cache) { + file_opts->cache = g_strdup("writeback"); + } + if (file_opts->preallocation == PREALLOC_MODE_FULL && + !strcmp(file_opts->cache, "none")) { + flags |= O_DIRECT; + } + if (file_opts->has_buffersize) { + buffer_size = file_opts->buffersize; + } /* Create file */ - fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp); + fd = qemu_create(file_opts->filename, flags, 0644, errp); if (fd < 0) { result = -errno; goto out; @@ -2929,7 +2959,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) } /* Clear the file by truncating it to 0 */ - result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp); + result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, + buffer_size, errp); if (result < 0) { goto out_unlock; } @@ -2973,7 +3004,8 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp) /* Resize and potentially preallocate the file to the desired * final size */ result = raw_regular_truncate(NULL, fd, file_opts->size, - file_opts->preallocation, errp); + file_opts->preallocation, + buffer_size, errp); if (result < 0) { goto out_unlock; } @@ -2994,6 +3026,8 @@ out_close: error_setg_errno(errp, -result, "Could not close the new file"); } out: + g_free(file_opts->cache); + file_opts->cache = NULL; return result; } @@ -3009,6 +3043,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, PreallocMode prealloc; char *buf = NULL; Error *local_err = NULL; + size_t buffersize = DEFAULT_BUFFER_SIZE; + char *cache = NULL; /* Skip file: protocol prefix */ strstart(filename, "file:", &filename); @@ -3031,6 +3067,21 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, return -EINVAL; } + buffersize = qemu_opt_get_size_del(opts, BLOCK_OPT_BUFFER_SIZE, + DEFAULT_BUFFER_SIZE); + 
if (buffersize < MIN_BUFFER_SIZE || buffersize > MAX_BUFFER_SIZE) { + error_setg_errno(errp, EINVAL, "Buffer size must be between %d " + "and %d", MIN_BUFFER_SIZE, MAX_BUFFER_SIZE); + return -EINVAL; + } + + cache = qemu_opt_get_del(opts, BLOCK_OPT_CACHE); + if (!cache) { + cache = g_strdup("writeback"); + } + + buffersize = ROUND_UP(buffersize, BUFFER_ALIGN_SIZE); + options = (BlockdevCreateOptions) { .driver = BLOCKDEV_DRIVER_FILE, .u.file = { @@ -3042,6 +3093,9 @@ raw_co_create_opts(BlockDriver *drv, const char *filename, .nocow = nocow, .has_extent_size_hint = has_extent_size_hint, .extent_size_hint = extent_size_hint, + .has_buffersize = true, + .buffersize = buffersize, + .cache = cache, }, }; return raw_co_create(&options, errp); @@ -3732,6 +3786,16 @@ static QemuOptsList raw_create_opts = { .type = QEMU_OPT_SIZE, .help = "Extent size hint for the image file, 0 to disable" }, + { + .name = BLOCK_OPT_CACHE, + .type = QEMU_OPT_STRING, + .help = "Cache mode (allowed values: writeback, none)" + }, + { + .name = BLOCK_OPT_BUFFER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "write buffer size" + }, { /* end of list */ } } }; diff --git a/block/io.c b/block/io.c index 7e62fabbf57cc9865492c62b0181ad9e8396769e..27d6a1a04bcc3a75cb3cd423f0375883f6305771 100644 --- a/block/io.c +++ b/block/io.c @@ -1756,22 +1756,29 @@ static int bdrv_pad_request(BlockDriverState *bs, return 0; } - sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes, - &sliced_head, &sliced_tail, - &sliced_niov); - - /* Guaranteed by bdrv_check_request32() */ - assert(*bytes <= SIZE_MAX); - ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov, - sliced_head, *bytes); - if (ret < 0) { - bdrv_padding_finalize(pad); - return ret; + /* + * For prefetching in stream_populate(), no qiov is passed along, because + * only copy-on-read matters. + */ + if (qiov && *qiov) { + sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes, + &sliced_head, &sliced_tail, + &sliced_niov); + + /* Guaranteed by bdrv_check_request32() */ + assert(*bytes <= SIZE_MAX); + ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov, + sliced_head, *bytes); + if (ret < 0) { + bdrv_padding_finalize(pad); + return ret; + } + *qiov = &pad->local_qiov; + *qiov_offset = 0; } + *bytes += pad->head + pad->tail; *offset -= pad->head; - *qiov = &pad->local_qiov; - *qiov_offset = 0; if (padded) { *padded = true; } @@ -1885,6 +1892,11 @@ bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, return -EINVAL; } + /* If opened with discard=off we should never unmap. */ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + /* Invalidate the cached block-status data range if this write overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); @@ -2338,10 +2350,6 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); assert_bdrv_graph_readable(); - if (!(child->bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - return bdrv_co_pwritev(child, offset, bytes, NULL, BDRV_REQ_ZERO_WRITE | flags); } diff --git a/block/mirror.c b/block/mirror.c index cd9d3ad4a8065ac18528f1ac07a25093d7b496cb..20b3e8e5d825a1410b72834bbd312ae3af337263 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1774,7 +1774,7 @@ static BlockJob *mirror_start_job( * reads on the top, while disabling it in the intermediate nodes, and make * the backing chain writable. 
*/ mirror_top_bs = bdrv_new_open_driver(&bdrv_mirror_top, filter_node_name, - BDRV_O_RDWR, errp); + BDRV_O_RDWR | BDRV_O_NOCACHE, errp); if (mirror_top_bs == NULL) { return NULL; } diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index c729cbf1eb8ab228f35ae6b6e87f2a6809fdab3b..78a6975852d01e4131f33e7930afe908107ef16a 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -415,7 +415,8 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict) goto exit; } - nbd_server_start(addr, NULL, NULL, 0, &local_err); + nbd_server_start(addr, NULL, NULL, NBD_DEFAULT_MAX_CONNECTIONS, + &local_err); qapi_free_SocketAddress(addr); if (local_err != NULL) { goto exit; diff --git a/block/parallels.c b/block/parallels.c index 9205a0864fa6ba96319eb58b3da54d11e99dcadb..8f2b58e1c9978c71a4b185d4757d636b42593cb1 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -1298,6 +1298,10 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, error_setg(errp, "Catalog too large"); return -EFBIG; } + if (le64_to_cpu(ph.ext_off) >= (INT64_MAX >> BDRV_SECTOR_BITS)) { + error_setg(errp, "Invalid image: Too big offset"); + return -EFBIG; + } size = bat_entry_off(s->bat_size); s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file->bs)); diff --git a/block/qcow2.c b/block/qcow2.c index 13e032bd5e2e07565bfd29cfd481560437bcd609..7af7c0bee4cfd93987d02b6b3cb3294c86a0b1b1 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1636,7 +1636,22 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - if (open_data_file) { + if (open_data_file && (flags & BDRV_O_NO_IO)) { + /* + * Don't open the data file for 'qemu-img info' so that it can be used + * to verify that an untrusted qcow2 image doesn't refer to external + * files. + * + * Note: This still makes has_data_file() return true. 
+ */ + if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) { + s->data_file = NULL; + } else { + s->data_file = bs->file; + } + qdict_extract_subqdict(options, NULL, "data-file."); + qdict_del(options, "data-file"); + } else if (open_data_file) { /* Open external data file */ bdrv_graph_co_rdunlock(); s->data_file = bdrv_co_open_child(NULL, options, "data-file", bs, diff --git a/block/raw-format.c b/block/raw-format.c index 1111dffd54f7da49bffa43d6177208b96e929aba..8195ed87cc76bd03a0e633ca797105db25f9c485 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -111,7 +111,7 @@ raw_apply_options(BlockDriverState *bs, BDRVRawState *s, uint64_t offset, if (offset > real_size) { error_setg(errp, "Offset (%" PRIu64 ") cannot be greater than " "size of the containing file (%" PRId64 ")", - s->offset, real_size); + offset, real_size); return -EINVAL; } @@ -119,7 +119,7 @@ raw_apply_options(BlockDriverState *bs, BDRVRawState *s, uint64_t offset, error_setg(errp, "The sum of offset (%" PRIu64 ") and size " "(%" PRIu64 ") has to be smaller or equal to the " " actual size of the containing file (%" PRId64 ")", - s->offset, s->size, real_size); + offset, size, real_size); return -EINVAL; } diff --git a/block/vvfat.c b/block/vvfat.c index 9d050ba3aea08eeac3d284e7d1e41545ff5b9f9f..9010f3f33f7dfe3b29610d9b26e6a2b488a12213 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2525,7 +2525,7 @@ commit_one_file(BDRVVVFATState* s, int dir_index, uint32_t offset) return -1; } - for (i = s->cluster_size; i < offset; i += s->cluster_size) + for (i = 0; i < offset; i += s->cluster_size) c = modified_fat_get(s, c); fd = qemu_open_old(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666); diff --git a/blockdev-nbd.c b/blockdev-nbd.c index 213012435f4616f7c84bc436c10dab158994a368..b36f41b7c5ae731957fd878204db4c16afe7dce5 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -21,12 +21,18 @@ #include "io/channel-socket.h" #include "io/net-listener.h" +typedef struct NBDConn { + QIOChannelSocket *cioc; + QLIST_ENTRY(NBDConn) next; +} NBDConn; + typedef struct NBDServerData { QIONetListener *listener; QCryptoTLSCreds *tlscreds; char *tlsauthz; uint32_t max_connections; uint32_t connections; + QLIST_HEAD(, NBDConn) conns; } NBDServerData; static NBDServerData *nbd_server; @@ -51,6 +57,14 @@ int nbd_server_max_connections(void) static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) { + NBDConn *conn = nbd_client_owner(client); + + assert(qemu_in_main_thread() && nbd_server); + + object_unref(OBJECT(conn->cioc)); + QLIST_REMOVE(conn, next); + g_free(conn); + nbd_client_put(client); assert(nbd_server->connections > 0); nbd_server->connections--; @@ -60,31 +74,56 @@ static void nbd_blockdev_client_closed(NBDClient *client, bool ignored) static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) { + NBDConn *conn = g_new0(NBDConn, 1); + + assert(qemu_in_main_thread() && nbd_server); nbd_server->connections++; + object_ref(OBJECT(cioc)); + conn->cioc = cioc; + QLIST_INSERT_HEAD(&nbd_server->conns, conn, next); nbd_update_server_watch(nbd_server); qio_channel_set_name(QIO_CHANNEL(cioc), "nbd-server"); - nbd_client_new(cioc, nbd_server->tlscreds, nbd_server->tlsauthz, - nbd_blockdev_client_closed); + /* TODO - expose handshake timeout as QMP option */ + nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS, + nbd_server->tlscreds, nbd_server->tlsauthz, + nbd_blockdev_client_closed, conn); } static void nbd_update_server_watch(NBDServerData *s) { - if (!s->max_connections || 
s->connections < s->max_connections) { - qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, NULL); - } else { - qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL); + if (s->listener) { + if (!s->max_connections || s->connections < s->max_connections) { + qio_net_listener_set_client_func(s->listener, nbd_accept, NULL, + NULL); + } else { + qio_net_listener_set_client_func(s->listener, NULL, NULL, NULL); + } } } static void nbd_server_free(NBDServerData *server) { + NBDConn *conn, *tmp; + if (!server) { return; } + /* + * Forcefully close the listener socket, and any clients that have + * not yet disconnected on their own. + */ qio_net_listener_disconnect(server->listener); object_unref(OBJECT(server->listener)); + server->listener = NULL; + QLIST_FOREACH_SAFE(conn, &server->conns, next, tmp) { + qio_channel_shutdown(QIO_CHANNEL(conn->cioc), QIO_CHANNEL_SHUTDOWN_BOTH, + NULL); + } + + AIO_WAIT_WHILE_UNLOCKED(NULL, server->connections > 0); + if (server->tlscreds) { object_unref(OBJECT(server->tlscreds)); } @@ -168,6 +207,10 @@ void nbd_server_start(SocketAddress *addr, const char *tls_creds, void nbd_server_start_options(NbdServerOptions *arg, Error **errp) { + if (!arg->has_max_connections) { + arg->max_connections = NBD_DEFAULT_MAX_CONNECTIONS; + } + nbd_server_start(arg->addr, arg->tls_creds, arg->tls_authz, arg->max_connections, errp); } @@ -180,6 +223,10 @@ void qmp_nbd_server_start(SocketAddressLegacy *addr, { SocketAddress *addr_flat = socket_address_flatten(addr); + if (!has_max_connections) { + max_connections = NBD_DEFAULT_MAX_CONNECTIONS; + } + nbd_server_start(addr_flat, tls_creds, tls_authz, max_connections, errp); qapi_free_SocketAddress(addr_flat); } diff --git a/blockdev.c b/blockdev.c index c91f49e7b62c04a3bd8155826294692a4ebfa5fc..d2fe5c361c6302f57eb67459f5d21c68743bc3fb 100644 --- a/blockdev.c +++ b/blockdev.c @@ -326,6 +326,8 @@ static int parse_block_error_action(const char *buf, bool is_read, Error **errp) return BLOCKDEV_ON_ERROR_STOP; } else if (!strcmp(buf, "report")) { return BLOCKDEV_ON_ERROR_REPORT; + } else if (!strcmp(buf, "retry")) { + return BLOCKDEV_ON_ERROR_RETRY; } else { error_setg(errp, "'%s' invalid %s error action", buf, is_read ? 
"read" : "write"); @@ -482,6 +484,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, const char *buf; int bdrv_flags = 0; int on_read_error, on_write_error; + int64_t retry_interval, retry_timeout; OnOffAuto account_invalid, account_failed; bool writethrough, read_only; BlockBackend *blk; @@ -493,6 +496,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, QDict *interval_dict = NULL; QList *interval_list = NULL; const char *id; + const char *cache; BlockdevDetectZeroesOptions detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF; const char *throttling_group = NULL; @@ -556,7 +560,7 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, qdict_put_str(bs_opts, "driver", buf); } - on_write_error = BLOCKDEV_ON_ERROR_ENOSPC; + on_write_error = BLOCKDEV_ON_ERROR_REPORT; if ((buf = qemu_opt_get(opts, "werror")) != NULL) { on_write_error = parse_block_error_action(buf, 0, &error); if (error) { @@ -574,12 +578,31 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, } } + retry_interval = qemu_opt_get_number(opts, "retry_interval", + BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL); + retry_timeout = qemu_opt_get_number(opts, "retry_timeout", 0); + if (snapshot) { bdrv_flags |= BDRV_O_SNAPSHOT; } read_only = qemu_opt_get_bool(opts, BDRV_OPT_READ_ONLY, false); + if ((!file || !*file) && qdict_size(bs_opts) == 2) { + cache = qdict_get_try_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH); + if (cache && !strcmp(cache, "on")) { + bdrv_flags |= BDRV_O_NO_FLUSH; + } + + cache = qdict_get_try_str(bs_opts, BDRV_OPT_CACHE_DIRECT); + if (cache && !strcmp(cache, "on")) { + bdrv_flags |= BDRV_O_NOCACHE; + } + + qdict_del(bs_opts, BDRV_OPT_CACHE_NO_FLUSH); + qdict_del(bs_opts, BDRV_OPT_CACHE_DIRECT); + } + /* init */ if ((!file || !*file) && !qdict_size(bs_opts)) { BlockBackendRootState *blk_rs; @@ -637,6 +660,11 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts, blk_set_enable_write_cache(blk, !writethrough); blk_set_on_error(blk, on_read_error, on_write_error); + if (on_read_error == BLOCKDEV_ON_ERROR_RETRY || + on_write_error == BLOCKDEV_ON_ERROR_RETRY) { + blk_set_on_error_retry_interval(blk, retry_interval); + blk_set_on_error_retry_timeout(blk, retry_timeout); + } if (!monitor_add_blk(blk, id, errp)) { blk_unref(blk); @@ -771,6 +799,14 @@ QemuOptsList qemu_legacy_drive_opts = { .name = "werror", .type = QEMU_OPT_STRING, .help = "write error action", + },{ + .name = "retry_interval", + .type = QEMU_OPT_NUMBER, + .help = "interval for retry action in millisecond", + },{ + .name = "retry_timeout", + .type = QEMU_OPT_NUMBER, + .help = "timeout for retry action in millisecond", },{ .name = "copy-on-read", .type = QEMU_OPT_BOOL, @@ -793,6 +829,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type, BlockInterfaceType type; int max_devs, bus_id, unit_id, index; const char *werror, *rerror; + int64_t retry_interval, retry_timeout; bool read_only = false; bool copy_on_read; const char *filename; @@ -1011,6 +1048,29 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type, qdict_put_str(bs_opts, "rerror", rerror); } + if (qemu_opt_find(legacy_opts, "retry_interval")) { + if ((werror == NULL || strcmp(werror, "retry")) && + (rerror == NULL || strcmp(rerror, "retry"))) { + error_setg(errp, "retry_interval is only supported " + "by werror/rerror=retry"); + goto fail; + } + retry_interval = qemu_opt_get_number(legacy_opts, "retry_interval", + BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL); + 
qdict_put_int(bs_opts, "retry_interval", retry_interval); + } + + if (qemu_opt_find(legacy_opts, "retry_timeout")) { + if ((werror == NULL || strcmp(werror, "retry")) && + (rerror == NULL || strcmp(rerror, "retry"))) { + error_setg(errp, "retry_timeout is only supported " + "by werror/rerror=retry"); + goto fail; + } + retry_timeout = qemu_opt_get_number(legacy_opts, "retry_timeout", 0); + qdict_put_int(bs_opts, "retry_timeout", retry_timeout); + } + /* Actual block device init: Functionality shared with blockdev-add */ blk = blockdev_init(filename, bs_opts, errp); bs_opts = NULL; @@ -3792,6 +3852,14 @@ QemuOptsList qemu_common_drive_opts = { .name = "werror", .type = QEMU_OPT_STRING, .help = "write error action", + },{ + .name = "retry_interval", + .type = QEMU_OPT_NUMBER, + .help = "interval for retry action in milliseconds", + },{ + .name = "retry_timeout", + .type = QEMU_OPT_NUMBER, + .help = "timeout for retry action in milliseconds", },{ .name = BDRV_OPT_READ_ONLY, .type = QEMU_OPT_BOOL, diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 73947da188d68d78fdb3ca78f4045db3519aab7a..0c9ab069aee3a88b7c7dbc353b60f32140c1aa43 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -337,6 +337,28 @@ static GSource *tcp_chr_add_watch(Chardev *chr, GIOCondition cond) return qio_channel_create_watch(s->ioc, cond); } +static void tcp_chr_set_reconnect_time(Chardev *chr, + int64_t reconnect_time) +{ + SocketChardev *s = SOCKET_CHARDEV(chr); + s->reconnect_time = reconnect_time; +} + +void qemu_chr_set_reconnect_time(Chardev *chr, int64_t reconnect_time) +{ + ChardevClass *cc = CHARDEV_GET_CLASS(chr); + SocketChardev *s = SOCKET_CHARDEV(chr); + + /* if the socket chardev is in listen mode, don't set the reconnect time */ + if (s->is_listen) { + return; + } + + if (cc->chr_set_reconnect_time) { + cc->chr_set_reconnect_time(chr, reconnect_time); + } +} + static void remove_hup_source(SocketChardev *s) { if (s->hup_source != NULL) { @@ -492,9 +514,9 @@ static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) s->max_size <= 0) { return TRUE; } - len = sizeof(buf); - if (len > s->max_size) { - len = s->max_size; + len = tcp_chr_read_poll(opaque); + if (len > sizeof(buf)) { + len = sizeof(buf); } size = tcp_chr_recv(chr, (void *)buf, len); if (size == 0 || (size == -1 && errno != EAGAIN)) { @@ -537,7 +559,7 @@ static int tcp_chr_sync_read(Chardev *chr, const uint8_t *buf, int len) if (s->state != TCP_CHARDEV_STATE_DISCONNECTED) { qio_channel_set_blocking(s->ioc, false, NULL); } - if (size == 0) { + if (size == 0 && chr->chr_for_flag != CHR_FOR_VHOST_USER) { /* connection closed */ tcp_chr_disconnect(chr); } @@ -1543,6 +1565,7 @@ static void char_socket_class_init(ObjectClass *oc, void *data) cc->set_msgfds = tcp_set_msgfds; cc->chr_add_client = tcp_chr_add_client; cc->chr_add_watch = tcp_chr_add_watch; + cc->chr_set_reconnect_time = tcp_chr_set_reconnect_time; cc->chr_update_read_handler = tcp_chr_update_read_handler; object_class_property_add(oc, "addr", "SocketAddress", diff --git a/chardev/char-stdio.c b/chardev/char-stdio.c index 3c648678ab14c7bbbd802a0400e3d5625f81207b..b960ddd4e4ca4da5dd6e4d2b2554dc83c532dd6f 100644 --- a/chardev/char-stdio.c +++ b/chardev/char-stdio.c @@ -41,6 +41,7 @@ /* init terminal so that we can grab keys */ static struct termios oldtty; static int old_fd0_flags; +static int old_fd1_flags; static bool stdio_in_use; static bool stdio_allow_signal; static bool stdio_echo_state; @@ -50,6 +51,8 @@ static void term_exit(void) { if (stdio_in_use) { tcsetattr(0,
TCSANOW, &oldtty); fcntl(0, F_SETFL, old_fd0_flags); + fcntl(1, F_SETFL, old_fd1_flags); + stdio_in_use = false; } } @@ -102,6 +105,7 @@ static void qemu_chr_open_stdio(Chardev *chr, stdio_in_use = true; old_fd0_flags = fcntl(0, F_GETFL); + old_fd1_flags = fcntl(1, F_GETFL); tcgetattr(0, &oldtty); if (!g_unix_set_fd_nonblocking(0, true, NULL)) { error_setg_errno(errp, errno, "Failed to set FD nonblocking"); diff --git a/configs/devices/aarch64-softmmu/default.mak b/configs/devices/aarch64-softmmu/default.mak index f82a04c27d1ba66a1199d0d249e48b69828f8bb6..8d66d0f1af7ce1b857e051b19227b508b4a20f16 100644 --- a/configs/devices/aarch64-softmmu/default.mak +++ b/configs/devices/aarch64-softmmu/default.mak @@ -8,3 +8,4 @@ include ../arm-softmmu/default.mak # CONFIG_XLNX_ZYNQMP_ARM=n # CONFIG_XLNX_VERSAL=n # CONFIG_SBSA_REF=n +# CONFIG_CPUFREQ=n diff --git a/configs/devices/i386-softmmu/default.mak b/configs/devices/i386-softmmu/default.mak index 598c6646dfc0f941385b4d07939739bf9ae53e5c..e948e54e4e902a26f257b9a89ccb66bc77703e21 100644 --- a/configs/devices/i386-softmmu/default.mak +++ b/configs/devices/i386-softmmu/default.mak @@ -23,6 +23,8 @@ #CONFIG_TPM_TIS_ISA=n #CONFIG_VTD=n #CONFIG_SGX=n +#CONFIG_CSV=n +#CONFIG_HYGON_CSV_MIG_ACCEL=n # Boards: # diff --git a/configs/devices/loongarch64-softmmu/default.mak b/configs/devices/loongarch64-softmmu/default.mak index 928bc117ef781a5f64512b972b459751d08717ce..ffe705836fde47b47f82db9baa8d1b8073eec4df 100644 --- a/configs/devices/loongarch64-softmmu/default.mak +++ b/configs/devices/loongarch64-softmmu/default.mak @@ -1,3 +1,7 @@ # Default configuration for loongarch64-softmmu -CONFIG_LOONGARCH_VIRT=y +# Uncomment the following lines to disable these optional devices: +# CONFIG_PCI_DEVICES=n + +# Boards are selected by default, uncomment to keep out of the build. +# CONFIG_LOONGARCH_VIRT=n diff --git a/configs/targets/loongarch64-softmmu.mak b/configs/targets/loongarch64-softmmu.mak index f23780fdd89945b82078b1680273bfa93d9f2ff1..0034c336200fb0629775e713982da70309464a54 100644 --- a/configs/targets/loongarch64-softmmu.mak +++ b/configs/targets/loongarch64-softmmu.mak @@ -1,5 +1,6 @@ TARGET_ARCH=loongarch64 TARGET_BASE_ARCH=loongarch +TARGET_KVM_HAVE_GUEST_DEBUG=y TARGET_SUPPORTS_MTTCG=y TARGET_XML_FILES= gdb-xml/loongarch-base32.xml gdb-xml/loongarch-base64.xml gdb-xml/loongarch-fpu.xml TARGET_NEED_FDT=y diff --git a/configure b/configure index bdda912f3626000ed0006276a36919bcf30821c1..6036de83a4abf7516e65c36d71c47ebd93779977 100755 --- a/configure +++ b/configure @@ -445,6 +445,7 @@ case "$cpu" in loongarch*) cpu=loongarch64 host_arch=loongarch64 + linux_arch=loongarch ;; mips64*) diff --git a/contrib/plugins/lockstep.c b/contrib/plugins/lockstep.c index 237543b43a76a1e657ddaef150314fc295366764..0c6f060183ac8cdb5d50db4488ea3cebd17ae9f7 100644 --- a/contrib/plugins/lockstep.c +++ b/contrib/plugins/lockstep.c @@ -100,6 +100,31 @@ static void plugin_exit(qemu_plugin_id_t id, void *p) plugin_cleanup(id); } +/* + * g_memdup has been deprecated in Glib since 2.68 and + * will complain about it if you try to use it. However until + * glib_req_ver for QEMU is bumped we make a copy of the glib-compat + * handler. 
+ */ +static inline gpointer g_memdup2_qemu(gconstpointer mem, gsize byte_size) +{ +#if GLIB_CHECK_VERSION(2, 68, 0) + return g_memdup2(mem, byte_size); +#else + gpointer new_mem; + + if (mem && byte_size != 0) { + new_mem = g_malloc(byte_size); + memcpy(new_mem, mem, byte_size); + } else { + new_mem = NULL; + } + + return new_mem; +#endif +} +#define g_memdup2(m, s) g_memdup2_qemu(m, s) + static void report_divergance(ExecState *us, ExecState *them) { DivergeState divrec = { log, 0 }; diff --git a/contrib/vhost-user-gpu/virgl.c b/contrib/vhost-user-gpu/virgl.c index d1ccdf7d066853a4bd60be0f9b7a4730c7ca5c1c..51da0e3667f9ef8ffa280f02f8a1c8f4c8c84354 100644 --- a/contrib/vhost-user-gpu/virgl.c +++ b/contrib/vhost-user-gpu/virgl.c @@ -327,7 +327,7 @@ virgl_get_resource_info_modifiers(uint32_t resource_id, #ifdef VIRGL_RENDERER_RESOURCE_INFO_EXT_VERSION struct virgl_renderer_resource_info_ext info_ext; ret = virgl_renderer_resource_get_info_ext(resource_id, &info_ext); - if (ret < 0) { + if (ret) { return ret; } @@ -335,7 +335,7 @@ virgl_get_resource_info_modifiers(uint32_t resource_id, *modifiers = info_ext.modifiers; #else ret = virgl_renderer_resource_get_info(resource_id, info); - if (ret < 0) { + if (ret) { return ret; } @@ -372,7 +372,7 @@ virgl_cmd_set_scanout(VuGpu *g, uint64_t modifiers = 0; ret = virgl_get_resource_info_modifiers(ss.resource_id, &info, &modifiers); - if (ret == -1) { + if (ret) { g_critical("%s: illegal resource specified %d\n", __func__, ss.resource_id); cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; diff --git a/cpu-common.c b/cpu-common.c index c81fd72d16d5c9e5f6f8af2d10b6f8f20e9cd3a6..a949ad7ca3cf5a526efed41c089d0646a5b10faa 100644 --- a/cpu-common.c +++ b/cpu-common.c @@ -24,6 +24,7 @@ #include "sysemu/cpus.h" #include "qemu/lockable.h" #include "trace/trace-root.h" +#include "hw/boards.h" QemuMutex qemu_cpu_list_lock; static QemuCond exclusive_cond; @@ -107,6 +108,46 @@ void cpu_list_remove(CPUState *cpu) cpu_list_generation_id++; } +CPUState *qemu_get_possible_cpu(int index) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + const CPUArchIdList *possible_cpus = ms->possible_cpus; + + if (possible_cpus == NULL) { + return qemu_get_cpu(index); + } + + assert((index >= 0) && (index < possible_cpus->len)); + + return CPU(possible_cpus->cpus[index].cpu); +} + +bool qemu_present_cpu(CPUState *cpu) +{ + return cpu; +} + +bool qemu_enabled_cpu(CPUState *cpu) +{ + return cpu && !cpu->disabled; +} + +bool qemu_persistent_cpu(CPUState *cpu) +{ + /* cpu state can be faked to the guest via acpi */ + return cpu->acpi_persistent; +} + +uint64_t qemu_get_cpu_archid(int cpu_index) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + const CPUArchIdList *possible_cpus = ms->possible_cpus; + + assert((cpu_index >= 0) && (cpu_index < possible_cpus->len)); + + return possible_cpus->cpus[cpu_index].arch_id; +} + CPUState *qemu_get_cpu(int index) { CPUState *cpu; @@ -193,6 +234,9 @@ void start_exclusive(void) CPUState *other_cpu; int running_cpus; + /* Ensure we are not running, or start_exclusive will be blocked. 
*/ + g_assert(!current_cpu->running); + if (current_cpu->exclusive_context_count) { current_cpu->exclusive_context_count++; return; diff --git a/crypto/block-luks.c b/crypto/block-luks.c index fb01ec38bbf00e29cadae5deeee1e4304d9a1e85..f0813d69b4d13b584d48d2565d85136f525da7bb 100644 --- a/crypto/block-luks.c +++ b/crypto/block-luks.c @@ -95,12 +95,23 @@ qcrypto_block_luks_cipher_size_map_twofish[] = { { 0, 0 }, }; +#ifdef CONFIG_CRYPTO_SM4 +static const QCryptoBlockLUKSCipherSizeMap +qcrypto_block_luks_cipher_size_map_sm4[] = { + { 16, QCRYPTO_CIPHER_ALG_SM4}, + { 0, 0 }, +}; +#endif + static const QCryptoBlockLUKSCipherNameMap qcrypto_block_luks_cipher_name_map[] = { { "aes", qcrypto_block_luks_cipher_size_map_aes }, { "cast5", qcrypto_block_luks_cipher_size_map_cast5 }, { "serpent", qcrypto_block_luks_cipher_size_map_serpent }, { "twofish", qcrypto_block_luks_cipher_size_map_twofish }, +#ifdef CONFIG_CRYPTO_SM4 + { "sm4", qcrypto_block_luks_cipher_size_map_sm4}, +#endif }; QEMU_BUILD_BUG_ON(sizeof(struct QCryptoBlockLUKSKeySlot) != 48); diff --git a/crypto/cipher-gcrypt.c.inc b/crypto/cipher-gcrypt.c.inc index a6a0117717f5bc1061e91612981ee26eaa8a1d3b..6b82280f9000295a1003ef4a164a32a913f8e32a 100644 --- a/crypto/cipher-gcrypt.c.inc +++ b/crypto/cipher-gcrypt.c.inc @@ -20,6 +20,56 @@ #include +static int qcrypto_cipher_alg_to_gcry_alg(QCryptoCipherAlgorithm alg) +{ + switch (alg) { + case QCRYPTO_CIPHER_ALG_DES: + return GCRY_CIPHER_DES; + case QCRYPTO_CIPHER_ALG_3DES: + return GCRY_CIPHER_3DES; + case QCRYPTO_CIPHER_ALG_AES_128: + return GCRY_CIPHER_AES128; + case QCRYPTO_CIPHER_ALG_AES_192: + return GCRY_CIPHER_AES192; + case QCRYPTO_CIPHER_ALG_AES_256: + return GCRY_CIPHER_AES256; + case QCRYPTO_CIPHER_ALG_CAST5_128: + return GCRY_CIPHER_CAST5; + case QCRYPTO_CIPHER_ALG_SERPENT_128: + return GCRY_CIPHER_SERPENT128; + case QCRYPTO_CIPHER_ALG_SERPENT_192: + return GCRY_CIPHER_SERPENT192; + case QCRYPTO_CIPHER_ALG_SERPENT_256: + return GCRY_CIPHER_SERPENT256; + case QCRYPTO_CIPHER_ALG_TWOFISH_128: + return GCRY_CIPHER_TWOFISH128; + case QCRYPTO_CIPHER_ALG_TWOFISH_256: + return GCRY_CIPHER_TWOFISH; +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: + return GCRY_CIPHER_SM4; +#endif + default: + return GCRY_CIPHER_NONE; + } +} + +static int qcrypto_cipher_mode_to_gcry_mode(QCryptoCipherMode mode) +{ + switch (mode) { + case QCRYPTO_CIPHER_MODE_ECB: + return GCRY_CIPHER_MODE_ECB; + case QCRYPTO_CIPHER_MODE_XTS: + return GCRY_CIPHER_MODE_XTS; + case QCRYPTO_CIPHER_MODE_CBC: + return GCRY_CIPHER_MODE_CBC; + case QCRYPTO_CIPHER_MODE_CTR: + return GCRY_CIPHER_MODE_CTR; + default: + return GCRY_CIPHER_MODE_NONE; + } +} + bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, QCryptoCipherMode mode) { @@ -35,6 +85,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_ALG_SERPENT_256: case QCRYPTO_CIPHER_ALG_TWOFISH_128: case QCRYPTO_CIPHER_ALG_TWOFISH_256: +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: +#endif break; default: return false; @@ -185,67 +238,26 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, return NULL; } - switch (alg) { - case QCRYPTO_CIPHER_ALG_DES: - gcryalg = GCRY_CIPHER_DES; - break; - case QCRYPTO_CIPHER_ALG_3DES: - gcryalg = GCRY_CIPHER_3DES; - break; - case QCRYPTO_CIPHER_ALG_AES_128: - gcryalg = GCRY_CIPHER_AES128; - break; - case QCRYPTO_CIPHER_ALG_AES_192: - gcryalg = GCRY_CIPHER_AES192; - break; - case QCRYPTO_CIPHER_ALG_AES_256: - gcryalg = GCRY_CIPHER_AES256; - break; - case 
QCRYPTO_CIPHER_ALG_CAST5_128: - gcryalg = GCRY_CIPHER_CAST5; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_128: - gcryalg = GCRY_CIPHER_SERPENT128; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_192: - gcryalg = GCRY_CIPHER_SERPENT192; - break; - case QCRYPTO_CIPHER_ALG_SERPENT_256: - gcryalg = GCRY_CIPHER_SERPENT256; - break; - case QCRYPTO_CIPHER_ALG_TWOFISH_128: - gcryalg = GCRY_CIPHER_TWOFISH128; - break; - case QCRYPTO_CIPHER_ALG_TWOFISH_256: - gcryalg = GCRY_CIPHER_TWOFISH; - break; - default: + gcryalg = qcrypto_cipher_alg_to_gcry_alg(alg); + if (gcryalg == GCRY_CIPHER_NONE) { error_setg(errp, "Unsupported cipher algorithm %s", QCryptoCipherAlgorithm_str(alg)); return NULL; } - drv = &qcrypto_gcrypt_driver; - switch (mode) { - case QCRYPTO_CIPHER_MODE_ECB: - gcrymode = GCRY_CIPHER_MODE_ECB; - break; - case QCRYPTO_CIPHER_MODE_XTS: - gcrymode = GCRY_CIPHER_MODE_XTS; - break; - case QCRYPTO_CIPHER_MODE_CBC: - gcrymode = GCRY_CIPHER_MODE_CBC; - break; - case QCRYPTO_CIPHER_MODE_CTR: - drv = &qcrypto_gcrypt_ctr_driver; - gcrymode = GCRY_CIPHER_MODE_CTR; - break; - default: + gcrymode = qcrypto_cipher_mode_to_gcry_mode(mode); + if (gcrymode == GCRY_CIPHER_MODE_NONE) { error_setg(errp, "Unsupported cipher mode %s", QCryptoCipherMode_str(mode)); return NULL; } + if (mode == QCRYPTO_CIPHER_MODE_CTR) { + drv = &qcrypto_gcrypt_ctr_driver; + } else { + drv = &qcrypto_gcrypt_driver; + } + ctx = g_new0(QCryptoCipherGcrypt, 1); ctx->base.driver = drv; diff --git a/crypto/cipher-nettle.c.inc b/crypto/cipher-nettle.c.inc index 24cc61f87bfc4ae7ab8ff523a4105a67bace67a1..2654b439c18912bd65aba6b7da3431ece73d432c 100644 --- a/crypto/cipher-nettle.c.inc +++ b/crypto/cipher-nettle.c.inc @@ -33,6 +33,9 @@ #ifndef CONFIG_QEMU_PRIVATE_XTS #include #endif +#ifdef CONFIG_CRYPTO_SM4 +#include +#endif static inline bool qcrypto_length_check(size_t len, size_t blocksize, Error **errp) @@ -426,6 +429,30 @@ DEFINE_ECB_CBC_CTR_XTS(qcrypto_nettle_twofish, QCryptoNettleTwofish, TWOFISH_BLOCK_SIZE, twofish_encrypt_native, twofish_decrypt_native) +#ifdef CONFIG_CRYPTO_SM4 +typedef struct QCryptoNettleSm4 { + QCryptoCipher base; + struct sm4_ctx key[2]; +} QCryptoNettleSm4; + +static void sm4_encrypt_native(void *ctx, size_t length, + uint8_t *dst, const uint8_t *src) +{ + struct sm4_ctx *keys = ctx; + sm4_crypt(&keys[0], length, dst, src); +} + +static void sm4_decrypt_native(void *ctx, size_t length, + uint8_t *dst, const uint8_t *src) +{ + struct sm4_ctx *keys = ctx; + sm4_crypt(&keys[1], length, dst, src); +} + +DEFINE_ECB(qcrypto_nettle_sm4, + QCryptoNettleSm4, SM4_BLOCK_SIZE, + sm4_encrypt_native, sm4_decrypt_native) +#endif bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, QCryptoCipherMode mode) @@ -443,6 +470,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_ALG_TWOFISH_128: case QCRYPTO_CIPHER_ALG_TWOFISH_192: case QCRYPTO_CIPHER_ALG_TWOFISH_256: +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: +#endif break; default: return false; @@ -495,8 +525,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_des_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleDES, 1); @@ -521,8 +553,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_des3_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + 
default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleDES3, 1); @@ -633,8 +667,10 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, case QCRYPTO_CIPHER_MODE_CTR: drv = &qcrypto_nettle_cast128_driver_ctr; break; - default: + case QCRYPTO_CIPHER_MODE_XTS: goto bad_cipher_mode; + default: + g_assert_not_reached(); } ctx = g_new0(QCryptoNettleCAST128, 1); @@ -701,6 +737,32 @@ static QCryptoCipher *qcrypto_cipher_ctx_new(QCryptoCipherAlgorithm alg, return &ctx->base; } +#ifdef CONFIG_CRYPTO_SM4 + case QCRYPTO_CIPHER_ALG_SM4: + { + QCryptoNettleSm4 *ctx; + const QCryptoCipherDriver *drv; + + switch (mode) { + case QCRYPTO_CIPHER_MODE_ECB: + drv = &qcrypto_nettle_sm4_driver_ecb; + break; + case QCRYPTO_CIPHER_MODE_CBC: + case QCRYPTO_CIPHER_MODE_CTR: + case QCRYPTO_CIPHER_MODE_XTS: + goto bad_cipher_mode; + default: + g_assert_not_reached(); + } + + ctx = g_new0(QCryptoNettleSm4, 1); + ctx->base.driver = drv; + sm4_set_encrypt_key(&ctx->key[0], key); + sm4_set_decrypt_key(&ctx->key[1], key); + + return &ctx->base; + } +#endif default: error_setg(errp, "Unsupported cipher algorithm %s", diff --git a/crypto/cipher.c b/crypto/cipher.c index 74b09a5b261bae00c01afa788dbca99017f4110e..5f512768ea3b05852960a65fadcf38b3ca6265d1 100644 --- a/crypto/cipher.c +++ b/crypto/cipher.c @@ -38,6 +38,9 @@ static const size_t alg_key_len[QCRYPTO_CIPHER_ALG__MAX] = { [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 24, [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 32, +#ifdef CONFIG_CRYPTO_SM4 + [QCRYPTO_CIPHER_ALG_SM4] = 16, +#endif }; static const size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = { @@ -53,6 +56,9 @@ static const size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = { [QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_192] = 16, [QCRYPTO_CIPHER_ALG_TWOFISH_256] = 16, +#ifdef CONFIG_CRYPTO_SM4 + [QCRYPTO_CIPHER_ALG_SM4] = 16, +#endif }; static const bool mode_need_iv[QCRYPTO_CIPHER_MODE__MAX] = { diff --git a/crypto/hash-gcrypt.c b/crypto/hash-gcrypt.c index 829e48258d326c3d8a9c7a5d2ef85a06a0dabfb2..8b305bd29aef78568ea5d09f6fca5e7af3bf3d87 100644 --- a/crypto/hash-gcrypt.c +++ b/crypto/hash-gcrypt.c @@ -33,13 +33,16 @@ static int qcrypto_hash_alg_map[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = GCRY_MD_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MD_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MD_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MD_SM3, +#endif }; gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg) { if (alg < G_N_ELEMENTS(qcrypto_hash_alg_map) && qcrypto_hash_alg_map[alg] != GCRY_MD_NONE) { - return true; + return gcry_md_test_algo(qcrypto_hash_alg_map[alg]) == 0; } return false; } @@ -53,7 +56,7 @@ qcrypto_gcrypt_hash_bytesv(QCryptoHashAlgorithm alg, size_t *resultlen, Error **errp) { - int i, ret; + gcry_error_t ret; gcry_md_hd_t md; unsigned char *digest; @@ -66,7 +69,7 @@ qcrypto_gcrypt_hash_bytesv(QCryptoHashAlgorithm alg, ret = gcry_md_open(&md, qcrypto_hash_alg_map[alg], 0); - if (ret < 0) { + if (ret != 0) { error_setg(errp, "Unable to initialize hash algorithm: %s", gcry_strerror(ret)); diff --git a/crypto/hash-nettle.c b/crypto/hash-nettle.c index 1ca1a4106283f2bc3f11d362433aac69d9737c81..0c2f8ce86c1d7ff5d6eedee967dc9fa269ccfa9b 100644 --- a/crypto/hash-nettle.c +++ b/crypto/hash-nettle.c @@ -25,6 +25,9 @@ #include #include #include +#ifdef CONFIG_CRYPTO_SM3 +#include +#endif typedef void (*qcrypto_nettle_init)(void *ctx); typedef void (*qcrypto_nettle_write)(void *ctx, @@ -42,6 +45,9 @@ 
union qcrypto_hash_ctx { struct sha384_ctx sha384; struct sha512_ctx sha512; struct ripemd160_ctx ripemd160; +#ifdef CONFIG_CRYPTO_SM3 + struct sm3_ctx sm3; +#endif }; struct qcrypto_hash_alg { @@ -92,6 +98,14 @@ struct qcrypto_hash_alg { .result = (qcrypto_nettle_result)ripemd160_digest, .len = RIPEMD160_DIGEST_SIZE, }, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = { + .init = (qcrypto_nettle_init)sm3_init, + .write = (qcrypto_nettle_write)sm3_update, + .result = (qcrypto_nettle_result)sm3_digest, + .len = SM3_DIGEST_SIZE, + }, +#endif }; gboolean qcrypto_hash_supports(QCryptoHashAlgorithm alg) diff --git a/crypto/hash.c b/crypto/hash.c index b0f8228bdcb4a0b39e194065570065881ac51396..8f1502ce686ea6417d709f5d13d00009ab6253d7 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -30,6 +30,9 @@ static size_t qcrypto_hash_alg_size[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = 48, [QCRYPTO_HASH_ALG_SHA512] = 64, [QCRYPTO_HASH_ALG_RIPEMD160] = 20, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = 32, +#endif }; size_t qcrypto_hash_digest_len(QCryptoHashAlgorithm alg) diff --git a/crypto/hmac-gcrypt.c b/crypto/hmac-gcrypt.c index 0c6f9797114c26f56301da6d9a97de98014046bd..15926fccfafd10199f7dfd8ef4e0923a09908be3 100644 --- a/crypto/hmac-gcrypt.c +++ b/crypto/hmac-gcrypt.c @@ -26,6 +26,9 @@ static int qcrypto_hmac_alg_map[QCRYPTO_HASH_ALG__MAX] = { [QCRYPTO_HASH_ALG_SHA384] = GCRY_MAC_HMAC_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MAC_HMAC_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MAC_HMAC_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MAC_HMAC_SM3, +#endif }; typedef struct QCryptoHmacGcrypt QCryptoHmacGcrypt; @@ -37,7 +40,7 @@ bool qcrypto_hmac_supports(QCryptoHashAlgorithm alg) { if (alg < G_N_ELEMENTS(qcrypto_hmac_alg_map) && qcrypto_hmac_alg_map[alg] != GCRY_MAC_NONE) { - return true; + return gcry_mac_test_algo(qcrypto_hmac_alg_map[alg]) == 0; } return false; diff --git a/crypto/hmac-nettle.c b/crypto/hmac-nettle.c index 1ad6c4f253069223b6a479bca25b25e858030918..3f103792abe481b25c2e6abdf3de455e0a0a3a68 100644 --- a/crypto/hmac-nettle.c +++ b/crypto/hmac-nettle.c @@ -38,6 +38,9 @@ struct QCryptoHmacNettle { struct hmac_sha256_ctx sha256_ctx; /* equals hmac_sha224_ctx */ struct hmac_sha512_ctx sha512_ctx; /* equals hmac_sha384_ctx */ struct hmac_ripemd160_ctx ripemd160_ctx; +#ifdef CONFIG_CRYPTO_SM3 + struct hmac_sm3_ctx sm3_ctx; +#endif } u; }; @@ -89,6 +92,14 @@ struct qcrypto_nettle_hmac_alg { .digest = (qcrypto_nettle_hmac_digest)hmac_ripemd160_digest, .len = RIPEMD160_DIGEST_SIZE, }, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = { + .setkey = (qcrypto_nettle_hmac_setkey)hmac_sm3_set_key, + .update = (qcrypto_nettle_hmac_update)hmac_sm3_update, + .digest = (qcrypto_nettle_hmac_digest)hmac_sm3_digest, + .len = SM3_DIGEST_SIZE, + }, +#endif }; bool qcrypto_hmac_supports(QCryptoHashAlgorithm alg) diff --git a/crypto/init.c b/crypto/init.c index fb7f1bff1053526a8b7c1f097ada5951f907ab8a..674d237fa9166c80f43bf4467df243f0b748581c 100644 --- a/crypto/init.c +++ b/crypto/init.c @@ -34,14 +34,11 @@ #include "crypto/random.h" -/* #define DEBUG_GNUTLS */ -#ifdef DEBUG_GNUTLS -static void qcrypto_gnutls_log(int level, const char *str) -{ - fprintf(stderr, "%d: %s", level, str); -} -#endif +/* + * To debug GNUTLS see env vars listed in + * https://gnutls.org/manual/html_node/Debugging-and-auditing.html + */ int qcrypto_init(Error **errp) { #ifdef CONFIG_GNUTLS @@ -53,10 +50,6 @@ int qcrypto_init(Error **errp) gnutls_strerror(ret)); return -1; } -#ifdef 
DEBUG_GNUTLS - gnutls_global_set_log_level(10); - gnutls_global_set_log_function(qcrypto_gnutls_log); -#endif #endif #ifdef CONFIG_GCRYPT diff --git a/crypto/pbkdf-gcrypt.c b/crypto/pbkdf-gcrypt.c index a8d8e64f4d46bbb65f8d0ef71cbe20c08bca31ed..09b38d0d6e1cc8c6640372309afac2450767ed54 100644 --- a/crypto/pbkdf-gcrypt.c +++ b/crypto/pbkdf-gcrypt.c @@ -33,6 +33,9 @@ bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash) case QCRYPTO_HASH_ALG_SHA384: case QCRYPTO_HASH_ALG_SHA512: case QCRYPTO_HASH_ALG_RIPEMD160: +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: +#endif return true; default: return false; @@ -54,6 +57,9 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, [QCRYPTO_HASH_ALG_SHA384] = GCRY_MD_SHA384, [QCRYPTO_HASH_ALG_SHA512] = GCRY_MD_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = GCRY_MD_RMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = GCRY_MD_SM3, +#endif }; int ret; diff --git a/crypto/pbkdf-nettle.c b/crypto/pbkdf-nettle.c index d6293c25a13b98880be6110e577f4d35a7af711c..5fea570bd3313b56c4440e939bae58610ae175cb 100644 --- a/crypto/pbkdf-nettle.c +++ b/crypto/pbkdf-nettle.c @@ -34,6 +34,9 @@ bool qcrypto_pbkdf2_supports(QCryptoHashAlgorithm hash) case QCRYPTO_HASH_ALG_SHA384: case QCRYPTO_HASH_ALG_SHA512: case QCRYPTO_HASH_ALG_RIPEMD160: +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: +#endif return true; default: return false; @@ -55,6 +58,9 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, struct hmac_sha384_ctx sha384; struct hmac_sha512_ctx sha512; struct hmac_ripemd160_ctx ripemd160; +#ifdef CONFIG_CRYPTO_SM3 + struct hmac_sm3_ctx sm3; +#endif } ctx; if (iterations > UINT_MAX) { @@ -106,6 +112,13 @@ int qcrypto_pbkdf2(QCryptoHashAlgorithm hash, PBKDF2(&ctx.ripemd160, hmac_ripemd160_update, hmac_ripemd160_digest, RIPEMD160_DIGEST_SIZE, iterations, nsalt, salt, nout, out); break; +#ifdef CONFIG_CRYPTO_SM3 + case QCRYPTO_HASH_ALG_SM3: + hmac_sm3_set_key(&ctx.sm3, nkey, key); + PBKDF2(&ctx.sm3, hmac_sm3_update, hmac_sm3_digest, + SM3_DIGEST_SIZE, iterations, nsalt, salt, nout, out); + break; +#endif default: error_setg_errno(errp, ENOSYS, diff --git a/crypto/pbkdf.c b/crypto/pbkdf.c index 8d198c152cf2ee06ea340d6b9adea4cbfd001136..d1c06ef3ed3bbe4f4509f624e6ba9dc59e71b72e 100644 --- a/crypto/pbkdf.c +++ b/crypto/pbkdf.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include "qemu/thread.h" #include "qapi/error.h" #include "crypto/pbkdf.h" #ifndef _WIN32 @@ -85,12 +86,28 @@ static int qcrypto_pbkdf2_get_thread_cpu(unsigned long long *val_ms, #endif } -uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, - const uint8_t *key, size_t nkey, - const uint8_t *salt, size_t nsalt, - size_t nout, - Error **errp) +typedef struct CountItersData { + QCryptoHashAlgorithm hash; + const uint8_t *key; + size_t nkey; + const uint8_t *salt; + size_t nsalt; + size_t nout; + uint64_t iterations; + Error **errp; +} CountItersData; + +static void *threaded_qcrypto_pbkdf2_count_iters(void *data) { + CountItersData *iters_data = (CountItersData *) data; + QCryptoHashAlgorithm hash = iters_data->hash; + const uint8_t *key = iters_data->key; + size_t nkey = iters_data->nkey; + const uint8_t *salt = iters_data->salt; + size_t nsalt = iters_data->nsalt; + size_t nout = iters_data->nout; + Error **errp = iters_data->errp; + uint64_t ret = -1; g_autofree uint8_t *out = g_new(uint8_t, nout); uint64_t iterations = (1 << 15); @@ -114,7 +131,10 @@ uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, delta_ms = end_ms - start_ms; - if (delta_ms > 500) { + if (delta_ms == 0) { /* 
sanity check */ + error_setg(errp, "Unable to get accurate CPU usage"); + goto cleanup; + } else if (delta_ms > 500) { break; } else if (delta_ms < 100) { iterations = iterations * 10; @@ -129,5 +149,24 @@ uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, cleanup: memset(out, 0, nout); - return ret; + iters_data->iterations = ret; + return NULL; +} + +uint64_t qcrypto_pbkdf2_count_iters(QCryptoHashAlgorithm hash, + const uint8_t *key, size_t nkey, + const uint8_t *salt, size_t nsalt, + size_t nout, + Error **errp) +{ + CountItersData data = { + hash, key, nkey, salt, nsalt, nout, 0, errp + }; + QemuThread thread; + + qemu_thread_create(&thread, "pbkdf2", threaded_qcrypto_pbkdf2_count_iters, + &data, QEMU_THREAD_JOINABLE); + qemu_thread_join(&thread); + + return data.iterations; } diff --git a/crypto/tlscredspsk.c b/crypto/tlscredspsk.c index 546cad1c5a49d25d20de84ca4613681e612c5810..0d6b71a37cf273b8d2ce3e3e447220572f0d5e17 100644 --- a/crypto/tlscredspsk.c +++ b/crypto/tlscredspsk.c @@ -243,6 +243,7 @@ qcrypto_tls_creds_psk_finalize(Object *obj) QCryptoTLSCredsPSK *creds = QCRYPTO_TLS_CREDS_PSK(obj); qcrypto_tls_creds_psk_unload(creds); + g_free(creds->username); } static void diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 6f81df92bcaba790477aff1ccb51048409331950..5636e9cf1d78bfb859cd8282778e777ac6fe18ea 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -11,12 +11,12 @@ Details about QEMU's various subsystems including how to add features to them. block-coroutine-wrapper clocks ebpf_rss - migration + migration/index multi-process reset s390-cpu-topology s390-dasd-ipl tracing - vfio-migration + vfio-iommufd writing-monitor-commands virtio-backends diff --git a/docs/devel/migration/best-practices.rst b/docs/devel/migration/best-practices.rst new file mode 100644 index 0000000000000000000000000000000000000000..d7c34a30149d68065cb547637e878777863586dd --- /dev/null +++ b/docs/devel/migration/best-practices.rst @@ -0,0 +1,48 @@ +============== +Best practices +============== + +Debugging +========= + +The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. + +Example usage: + +.. code-block:: shell + + $ qemu-system-x86_64 -display none -monitor stdio + (qemu) migrate "exec:cat > mig" + (qemu) q + $ ./scripts/analyze-migration.py -f mig + { + "ram (3)": { + "section sizes": { + "pc.ram": "0x0000000008000000", + ... + +See also ``analyze-migration.py -h`` help for more options. + +Firmware +======== + +Migration migrates the copies of RAM and ROM, and thus when running +on the destination it includes the firmware from the source. Even after +resetting a VM, the old firmware is used. Only once QEMU has been restarted +is the new firmware in use. + +- Changes in firmware size can cause changes in the required RAMBlock size + to hold the firmware and thus migration can fail. In practice it's best + to pad firmware images to convenient powers of 2 with plenty of space + for growth. + +- Care should be taken with device emulation code so that newer + emulation code can work with older firmware to allow forward migration. + +- Care should be taken with newer firmware so that backward migration + to older systems with older device emulation code will work. + +In some cases it may be best to tie specific firmware versions to specific +versioned machine types to cut down on the combinations that will need +support. This is also useful when newer versions of firmware outgrow +the padding. 
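+ +As a minimal illustration of the padding advice above (the file name and the 4 MiB target are only examples, not a recommendation for any specific firmware), a build step could zero-pad the image to the chosen power of 2 with coreutils: + +.. code-block:: shell + + $ stat -c %s firmware.bin + 3653632 + $ truncate -s 4M firmware.bin # extends and zero-pads to 4 MiB + $ stat -c %s firmware.bin + 4194304 + +The RAMBlock holding the image then keeps a stable size across firmware rebuilds, as long as the image does not outgrow the padding.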
diff --git a/docs/devel/migration/compatibility.rst b/docs/devel/migration/compatibility.rst new file mode 100644 index 0000000000000000000000000000000000000000..5a5417ef069ef38f2039eb94dc5fcc6370e69af3 --- /dev/null +++ b/docs/devel/migration/compatibility.rst @@ -0,0 +1,517 @@ +Backwards compatibility +======================= + +How backwards compatibility works +--------------------------------- + +When we do migration, we have two QEMU processes: the source and the +target. There are two cases: either they are the same version or they are +different versions. The easy case is when they are the same version. +The difficult one is when they are different versions. + +There are two things that are different, but they have very similar +names and sometimes get confused: + +- QEMU version +- machine type version + +Let's start with a practical example: + +- qemu-system-x86_64 (v5.2), from now on qemu-5.2. +- qemu-system-x86_64 (v5.1), from now on qemu-5.1. + +Related to this are the "latest" machine types defined on each of +them: + +- pc-q35-5.2 (the newest one in qemu-5.2), from now on pc-5.2 +- pc-q35-5.1 (the newest one in qemu-5.1), from now on pc-5.1 + +First of all, migration is only supposed to work if you use the same +machine type on both source and destination. The QEMU hardware +configuration also needs to be the same on source and destination. +Most aspects of the backend configuration can be changed at will, +except for a few cases where the backend features influence frontend +device feature exposure. But that is not relevant for this section. + +I am going to list the combinations that we can have. Let's +start with the trivial ones, where QEMU is the same on source and +destination: + +1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 + + This is the latest QEMU with the latest machine type. + This has to work, and if it doesn't work it is a bug. + +2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + Exactly the same case as the previous one, but for 5.1. + Nothing to see here either. + +These are the easiest ones; we will not talk about them further in this +section. + +Now we start with the more interesting cases. Consider the case where +we have the same QEMU version on both sides (qemu-5.2), but instead of +the latest machine type for that version (pc-5.2) we are using one from an older +QEMU version, in this case pc-5.1. + +3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + It needs to use the definition of pc-5.1 and the devices as they + were configured on 5.1, but this should be easy in the sense that + both sides are the same QEMU and both sides have exactly the same + idea of what the pc-5.1 machine is. + +4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 + + This combination is not possible, as qemu-5.1 doesn't understand the + pc-5.2 machine type. So nothing to worry about here. + +Now come the interesting cases, when the two QEMU processes are +different versions. Notice also that the machine type needs to be pc-5.1, +because of the limitation that qemu-5.1 doesn't know pc-5.2. So +the possible cases are: + +5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + This migration is known as newer to older. While developing 5.2 + we need to take care not to break + migration to qemu-5.1. Notice that we can't make updates to + qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is + up to the qemu-5.2 side to make the relevant changes.
+I am going to list the combinations that we can have. Let's start
+with the trivial ones, where QEMU is the same on source and
+destination:
+
+1 - qemu-5.2 -M pc-5.2  -> migrates to -> qemu-5.2 -M pc-5.2
+
+    This is the latest QEMU with the latest machine type.
+    This has to work, and if it doesn't it is a bug.
+
+2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1
+
+    Exactly the same case as the previous one, but for 5.1.
+    Nothing to see here either.
+
+These are the easiest ones; we will not talk more about them in this
+section.
+
+Now we start with the more interesting cases. Consider the case where
+we have the same QEMU version on both sides (qemu-5.2), but instead of
+the latest machine type for that version (pc-5.2) we are using one of
+an older QEMU version, in this case pc-5.1.
+
+3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1
+
+    It needs to use the definition of pc-5.1 and the devices as they
+    were configured on 5.1, but this should be easy in the sense that
+    both sides are the same QEMU and both sides have exactly the same
+    idea of what the pc-5.1 machine is.
+
+4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2
+
+    This combination is not possible, as qemu-5.1 doesn't understand
+    the pc-5.2 machine type. So nothing to worry about here.
+
+Now come the interesting ones, when the two QEMU processes are
+different. Notice also that the machine type needs to be pc-5.1,
+because of the limitation that qemu-5.1 doesn't know pc-5.2. So the
+possible cases are:
+
+5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1
+
+    This migration is known as newer to older. While developing 5.2
+    we need to take care not to break migration to qemu-5.1. Notice
+    that we can't make updates to qemu-5.1 to understand whatever
+    qemu-5.2 decides to change, so it is on the qemu-5.2 side to make
+    the relevant changes.
+
+6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1
+
+    This migration is known as older to newer. We need to make sure
+    that we are able to receive migrations from qemu-5.1. The problem
+    is similar to the previous one.
+
+If qemu-5.1 and qemu-5.2 were the same, there would not be any
+compatibility problems. But the reason that we create qemu-5.2 is to
+get new features, devices, defaults, etc.
+
+If we get a device that has a new feature, or change a default value,
+we have a problem when we try to migrate between different QEMU
+versions.
+
+So we need a way to tell qemu-5.2 that when we are using machine type
+pc-5.1, it needs to **not** use the feature, to be able to migrate to
+a real qemu-5.1.
+
+The equivalent applies when migrating from qemu-5.1 to qemu-5.2:
+qemu-5.2 has to expect that it is not going to get data for the new
+feature, because qemu-5.1 doesn't know about it.
+
+How do we tell QEMU about these device feature changes? In
+hw/core/machine.c:hw_compat_X_Y arrays.
+
+If we change a default value, we need to put the old value back in
+that array, and the device, during initialization, needs to look at
+that array to see what value it should use for that feature. What we
+put in that array is the value of a property.
+
+To create a property for a device, we need to use one of the
+DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the
+macros that exist. With them, we set the default value for that
+property, and that is what it is going to be in the latest released
+version. But if we want a different value for a previous version, we
+can change that in the hw_compat_X_Y arrays.
+
+hw_compat_X_Y is an array of records with the format:
+
+- name_device
+- name_property
+- value
+
+Let's see a practical example.
+
+In qemu-5.2 virtio-blk-device got multi-queue support. This is a
+change that is not backward compatible. In qemu-5.1 it has one queue.
+In qemu-5.2 it has the same number of queues as the number of cpus in
+the system.
+
+When we are doing migration, if we migrate from a device that has 4
+queues to a device that has only one queue, we don't know where to
+put the extra information for the other 3 queues, and we fail
+migration.
+
+A similar problem occurs when we migrate from qemu-5.1, which has only
+one queue, to qemu-5.2: we only sent information for one queue, but
+the destination has 4, so there are 3 queues that are not properly
+initialized and anything can happen.
+
+So, how can we address this problem? Easy: just convince qemu-5.2
+that when it is running pc-5.1, it needs to set the number of queues
+for virtio-blk-devices to 1.
+
+That way we fix cases 5 and 6.
+
+5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1
+
+    qemu-5.2 -M pc-5.1 sets the number of queues to be 1.
+    qemu-5.1 -M pc-5.1 expects the number of queues to be 1.
+
+    Correct. Migration works.
+
+6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1
+
+    qemu-5.1 -M pc-5.1 sets the number of queues to be 1.
+    qemu-5.2 -M pc-5.1 expects the number of queues to be 1.
+
+    Correct. Migration works.
+
+And now the other interesting case, case 3. In this case we have:
+
+3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1
+
+    Here we have the same QEMU on both sides. So it doesn't matter a
+    lot if we have set the number of queues to 1 or not, because they
+    are the same.
+
+    WRONG!
+
+    Think about what happens if we do one of these double migrations:
+
+    A -> migrates -> B -> migrates -> C
+
+    where:
+
+    A: qemu-5.1 -M pc-5.1
+    B: qemu-5.2 -M pc-5.1
+    C: qemu-5.2 -M pc-5.1
+
+    migration A -> B is case 6, so the number of queues needs to be 1.
+
+    migration B -> C is case 3, so we don't care. But actually we do
+    care, because the guest was not started in qemu-5.2; it came
+    migrated from qemu-5.1. So to be on the safe side, we need to
+    always use one queue when we are using pc-5.1.
+
+Now, how was this done in reality? The following commit shows how it
+was done::
+
+    commit 9445e1e15e66c19e42bea942ba810db28052cd05
+    Author: Stefan Hajnoczi
+    Date:   Tue Aug 18 15:33:47 2020 +0100
+
+    virtio-blk-pci: default num_queues to -smp N
+
+The relevant parts for migration are::
+
+    @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = {
+     #endif
+         DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
+                         true),
+    -    DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
+    +    DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues,
+    +                       VIRTIO_BLK_AUTO_NUM_QUEUES),
+         DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256),
+
+It changes the default value of num_queues. But it fixes it up for old
+machine types so that they keep the right value::
+
+    @@ -31,6 +31,7 @@
+     GlobalProperty hw_compat_5_1[] = {
+         ...
+    +    { "virtio-blk-device", "num-queues", "1"},
+         ...
+     };
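+A single entry of that array has the same effect as setting the
+corresponding global property by hand on the command line, which can
+be handy for experiments (a hedged sketch; normally this is the
+machine type's job, not the user's):
+
+.. code-block:: shell
+
+    # force the qemu-5.1 value of the property, as hw_compat_5_1 does
+    $ qemu-system-x86_64 -M pc-q35-5.2 \
+        -global virtio-blk-device.num-queues=1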
+
+A device with different features on both sides
+----------------------------------------------
+
+Let's assume that we are using the same QEMU binary on both sides,
+just to make things easier. But we have a device that has different
+features on both sides of the migration. That can be because the
+devices are different, because the kernel drivers of the two devices
+have different features, or whatever else.
+
+How can we get this to work with migration? The way to do that is
+"theoretically" easy: you take the features that the device has on
+the source of the migration and the features that the device has on
+the target of the migration, you compute the intersection of the
+features of both sides, and that is the configuration with which you
+should launch QEMU.
+
+Notice that this is not completely a QEMU problem. The most important
+thing here is that this should be handled by the managing application
+that launches QEMU. If QEMU is configured correctly, the migration
+will succeed.
+
+That said, actually doing it is complicated. Almost all devices are
+bad at being launched with only some features enabled. With one big
+exception: cpus.
+
+You can read the documentation for QEMU x86 cpu models here:
+
+https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html
+
+When they talk about migration, they recommend choosing the newest
+cpu model that is supported by all the hosts involved.
+
+Let's say that we have:
+
+Host A:
+
+Device X has feature Y
+
+Host B:
+
+Device X does not have feature Y
+
+If we try to migrate without any care from host A to host B, it will
+fail, because when migration tries to load feature Y on the
+destination, it will find that the hardware is not there.
+
+Doing this would be the equivalent of doing the following with cpus:
+
+Host A:
+
+$ qemu-system-x86_64 -cpu host
+
+Host B:
+
+$ qemu-system-x86_64 -cpu host
+
+When the two hosts have different cpu features this is guaranteed to
+fail, especially if Host B has fewer features than host A. If host A
+has fewer features than host B, it sometimes works. The important
+word in that sentence is "sometimes".
+
+So, forgetting about cpu models and continuing with the -cpu host
+example, let's say that the difference between the cpus is that Hosts
+A and B have the following features:
+
+Features:   'pcid'  'stibp'  'taa-no'
+Host A:        X        X
+Host B:                          X
+
+If we want to migrate between them, the way to configure both QEMU
+cpus will be:
+
+Host A:
+
+$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off
+
+Host B:
+
+$ qemu-system-x86_64 -cpu host,taa-no=off
+
+And you would be able to migrate between them. It is the
+responsibility of the management application or of the user to make
+sure that the configuration is correct. QEMU doesn't know how to
+check this kind of feature in general.
+
+Notice that we don't recommend using -cpu host for migration; it is
+used in this example because it makes the example simpler.
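+In practice, rather than trimming ``-cpu host`` feature by feature,
+the usual approach is to pick a named cpu model that both hosts can
+provide (a hedged sketch; ``Haswell`` is only an illustrative choice):
+
+.. code-block:: shell
+
+    # the same named model on both hosts
+    $ qemu-system-x86_64 -cpu Haswell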
+
+Other devices have worse control over individual features. If they
+want to be able to migrate between hosts that show different features,
+the device needs a way to configure which ones it is going to use.
+
+In this section we have assumed that we are using the same QEMU binary
+on both sides of the migration. If we use different QEMU versions,
+then we need to take into account all the other differences, and the
+examples become even more complicated.
+
+How to mitigate when we have a backward compatibility error
+-----------------------------------------------------------
+
+We break migration for old machine types continuously during
+development. But as soon as we find that there is a problem, we fix
+it. The problem is what happens when, after we have done a release,
+we detect that something has gone wrong.
+
+Let's see how it worked with one example.
+
+After the release of qemu-8.0 we found a problem when doing migration
+of the machine type pc-7.2.
+
+- $ qemu-7.2 -M pc-7.2  ->  qemu-7.2 -M pc-7.2
+
+   This migration works
+
+- $ qemu-8.0 -M pc-7.2  ->  qemu-8.0 -M pc-7.2
+
+   This migration works
+
+- $ qemu-8.0 -M pc-7.2  ->  qemu-7.2 -M pc-7.2
+
+   This migration fails
+
+- $ qemu-7.2 -M pc-7.2  ->  qemu-8.0 -M pc-7.2
+
+   This migration fails
+
+So clearly something fails when migrating between qemu-7.2 and
+qemu-8.0 with machine type pc-7.2. The error messages and git bisect
+pointed to this commit.
+
+In qemu-8.0 we got this commit::
+
+    commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2
+    Author: Jonathan Cameron
+    Date:   Thu Mar 2 13:37:02 2023 +0000
+
+    hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register
+
+The relevant bits of the commit for our example are these::
+
+    --- a/hw/pci/pcie_aer.c
+    +++ b/hw/pci/pcie_aer.c
+    @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev,
+
+         pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
+                      PCI_ERR_UNC_SUPPORTED);
+    +    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+    +                 PCI_ERR_UNC_MASK_DEFAULT);
+    +    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+    +                 PCI_ERR_UNC_SUPPORTED);
+
+         pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
+                      PCI_ERR_UNC_SEVERITY_DEFAULT);
+
+The patch changes how we configure the PCI space for AER. But QEMU
+fails when the PCI space configuration is different between source
+and destination.
+
+The following commit shows how this got fixed::
+
+    commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f
+    Author: Leonardo Bras
+    Date:   Tue May 2 21:27:02 2023 -0300
+
+    hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0
+
+    [...]
+
+The relevant parts of the fix in QEMU are as follows:
+
+First, we create a new property for the device to be able to configure
+the old behaviour or the new behaviour::
+
+    diff --git a/hw/pci/pci.c b/hw/pci/pci.c
+    index 8a87ccc8b0..5153ad63d6 100644
+    --- a/hw/pci/pci.c
+    +++ b/hw/pci/pci.c
+    @@ -79,6 +79,8 @@ static Property pci_props[] = {
+         DEFINE_PROP_STRING("failover_pair_id", PCIDevice,
+                            failover_pair_id),
+         DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0),
+    +    DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present,
+    +                    QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
+         DEFINE_PROP_END_OF_LIST()
+     };
+
+Notice that we enable the feature for new machine types.
+
+Now we see how the fix is done. This is going to depend on what kind
+of breakage happens, but in this case it is quite simple::
+
+    diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c
+    index 103667c368..374d593ead 100644
+    --- a/hw/pci/pcie_aer.c
+    +++ b/hw/pci/pcie_aer.c
+    @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver,
+                           uint16_t offset,
+
+         pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS,
+                      PCI_ERR_UNC_SUPPORTED);
+    -    pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+    -                 PCI_ERR_UNC_MASK_DEFAULT);
+    -    pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+    -                 PCI_ERR_UNC_SUPPORTED);
+    +
+    +    if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) {
+    +        pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK,
+    +                     PCI_ERR_UNC_MASK_DEFAULT);
+    +        pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK,
+    +                     PCI_ERR_UNC_SUPPORTED);
+    +    }
+
+         pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER,
+                      PCI_ERR_UNC_SEVERITY_DEFAULT);
+
+I.e. if the property bit is enabled, we configure it as we did for
+qemu-8.0; if the property bit is not set, we configure it as it was
+in 7.2.
+
+And now, all that is missing is disabling the feature for old machine
+types::
+
+    diff --git a/hw/core/machine.c b/hw/core/machine.c
+    index 47a34841a5..07f763eb2e 100644
+    --- a/hw/core/machine.c
+    +++ b/hw/core/machine.c
+    @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = {
+         { "e1000e", "migrate-timadj", "off" },
+         { "virtio-mem", "x-early-migration", "false" },
+         { "migration", "x-preempt-pre-7-2", "true" },
+    +    { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" },
+     };
+     const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2);
+
+And now, when qemu-8.0.1 is released with this fix, all combinations
+are going to work as intended.
+
+- $ qemu-7.2 -M pc-7.2   ->  qemu-7.2 -M pc-7.2 (works)
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-8.0.1 -M pc-7.2 (works)
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-7.2 -M pc-7.2 (works)
+- $ qemu-7.2 -M pc-7.2   ->  qemu-8.0.1 -M pc-7.2 (works)
+
+So normality has been restored and everything is ok, no?
+
+Not really; now our matrix is much bigger. We started with the easy
+cases, migration from the same version to the same version always
+works:
+
+- $ qemu-7.2 -M pc-7.2   ->  qemu-7.2 -M pc-7.2
+- $ qemu-8.0 -M pc-7.2   ->  qemu-8.0 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-8.0.1 -M pc-7.2
+
+Now the interesting ones, when the QEMU process versions are
+different. The first pair fail, and we can do nothing about it: both
+versions are released and we can't change anything.
+
+- $ qemu-7.2 -M pc-7.2 ->  qemu-8.0 -M pc-7.2
+- $ qemu-8.0 -M pc-7.2 ->  qemu-7.2 -M pc-7.2
+
+These two are the ones that work.
+The whole point of making the change in the qemu-8.0.1 release was to
+fix this pair:
+
+- $ qemu-7.2 -M pc-7.2   ->  qemu-8.0.1 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-7.2 -M pc-7.2
+
+But now we find that qemu-8.0 can migrate neither to qemu-7.2 nor to
+qemu-8.0.1.
+
+- $ qemu-8.0 -M pc-7.2   ->  qemu-8.0.1 -M pc-7.2
+- $ qemu-8.0.1 -M pc-7.2 ->  qemu-8.0 -M pc-7.2
+
+So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to
+anything except another qemu-8.0.
+
+Can we do better?
+
+Yes. If we know that we are going to do this migration:
+
+- $ qemu-8.0 -M pc-7.2 ->  qemu-8.0.1 -M pc-7.2
+
+we can launch the appropriate devices with::
+
+    --device...,x-pcie-err-unc-mask=on
+
+And now we can receive a migration from 8.0. And from then on, we can
+do that migration to new machine types if we remember to enable that
+property for pc-7.2. Notice that we really do need to remember; it is
+not enough to know that the source of the migration is qemu-8.0.
+Think of this example:
+
+$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2
+
+In the second migration the source is not qemu-8.0, but we still have
+that "problem", so we keep that property enabled. Notice that we need
+to keep this mark/property until the machine has been rebooted. And a
+normal reboot (which doesn't reload QEMU) is not enough: we need the
+machine to power off and power on again on a fixed QEMU. From then on
+we can use the proper real machine.
diff --git a/docs/devel/migration/dirty-limit.rst b/docs/devel/migration/dirty-limit.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8f32329d5fda2e239d378c0bc4662fd8cf6d7d23
--- /dev/null
+++ b/docs/devel/migration/dirty-limit.rst
@@ -0,0 +1,71 @@
+Dirty limit
+===========
+
+The dirty limit, short for dirty page rate upper limit, is a new
+capability introduced in the 8.1 QEMU release that uses a new
+algorithm based on the KVM dirty ring to throttle down the guest
+during live migration.
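+A quick way to see the feature in action is to set a per-vCPU dirty
+page rate limit via QMP (a hedged sketch using scripts/qmp/qmp-shell;
+the 200 MB/s value is an arbitrary example):
+
+.. code-block:: shell
+
+    (QEMU) set-vcpu-dirty-limit dirty-rate=200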
+
+The algorithm framework is as follows:
+
+::
+
+  ------------------------------------------------------------------------------
+  main   --------------> throttle thread ------------> PREPARE(1) <--------
+  thread  \                                                |              |
+           \                                               |              |
+            \                                              V              |
+             -\                                        CALCULATE(2)       |
+               \                                           |              |
+                \                                          |              |
+                 \                                         V              |
+                  \                                    SET PENALTY(3) -----
+                   -\                                      |
+                     \                                     |
+                      \                                    V
+                       -> virtual CPU thread -------> ACCEPT PENALTY(4)
+  ------------------------------------------------------------------------------
+
+When the qmp command qmp_set_vcpu_dirty_limit is called for the first
+time, the QEMU main thread starts the throttle thread. The throttle
+thread, once launched, executes the loop, which consists of three
+steps:
+
+  - PREPARE (1)
+
+     The entire work of PREPARE (1) is preparation for the second
+     stage, CALCULATE(2), as the name implies. It involves preparing
+     the dirty page rate value and the corresponding upper limit of
+     the VM: the dirty page rate is calculated via the KVM dirty ring
+     mechanism, which tells QEMU how many dirty pages a virtual CPU
+     has had since the last KVM_EXIT_DIRTY_RING_FULL exception; the
+     dirty page rate upper limit is specified by the caller, so it is
+     fetched directly.
+
+  - CALCULATE (2)
+
+     Calculate a suitable sleep period for each virtual CPU, which
+     will be used to determine the penalty for the target virtual CPU.
+     The computation must be done carefully in order to reduce the
+     dirty page rate progressively down to the upper limit without
+     oscillation. To achieve this, two strategies are provided: the
+     first is to add or subtract sleep time based on the ratio of the
+     current dirty page rate to the limit, which is used when the
+     current dirty page rate is far from the limit; the second is to
+     add or subtract a fixed time when the current dirty page rate is
+     close to the limit.
+
+  - SET PENALTY (3)
+
+     Set the sleep time for each virtual CPU that should be penalized
+     based on the results of the calculation supplied by step
+     CALCULATE (2).
+
+After completing the above three stages, the throttle thread loops
+back to step PREPARE (1) until the dirty limit is reached.
+
+On the other hand, each virtual CPU thread reads the sleep duration
+and sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception
+handler; that is ACCEPT PENALTY (4). Virtual CPUs running write-heavy
+processes will naturally take this exit path and get penalized,
+whereas virtual CPUs running read-mostly processes will not.
+
+In summary, thanks to the KVM dirty ring technology, the dirty limit
+algorithm will restrict virtual CPUs as needed to keep their dirty
+page rate inside the limit. This leads to more steady read
+performance during live migration and can help improve the
+responsiveness of large guests.
diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7c5ce9e79d4eafe18c9c01b83ddccd5fcc28abbf
--- /dev/null
+++ b/docs/devel/migration/features.rst
@@ -0,0 +1,15 @@
+Migration features
+==================
+
+Migration has plenty of features to support different use cases.
+
+.. toctree::
+   :maxdepth: 2
+
+   postcopy
+   dirty-limit
+   vfio
+   virtio
+   qpl-compression
+   uadk-compression
+   qatzip-compression
diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2aa294d6314bd4cb18623e0caf9adc6d93fd030e
--- /dev/null
+++ b/docs/devel/migration/index.rst
@@ -0,0 +1,13 @@
+Migration
+=========
+
+This is the main entry point for the QEMU migration documentation. It
+explains how QEMU live migration works.
+
+.. toctree::
+   :maxdepth: 2
+
+   main
+   features
+   compatibility
+   best-practices
diff --git a/docs/devel/migration.rst b/docs/devel/migration/main.rst
similarity index 38%
rename from docs/devel/migration.rst
rename to docs/devel/migration/main.rst
index ec55089b2530a0296f858b1f078b557d12e154b7..396c7c51ca8552e7d542201bd21da54d1e96439b 100644
--- a/docs/devel/migration.rst
+++ b/docs/devel/migration/main.rst
@@ -1,6 +1,6 @@
-=========
-Migration
-=========
+===================
+Migration framework
+===================
 
 QEMU has code to load/save the state of the guest that it is running.
 These are two complementary operations. Saving the state just does
@@ -52,27 +52,6 @@ All these migration protocols use the same infrastructure to
 save/restore state devices. This infrastructure is shared with the
 savevm/loadvm functionality.
 
-Debugging
-=========
-
-The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``.
-
-Example usage:
-
-.. code-block:: shell
-
-  $ qemu-system-x86_64 -display none -monitor stdio
-  (qemu) migrate "exec:cat > mig"
-  (qemu) q
-  $ ./scripts/analyze-migration.py -f mig
-  {
-  "ram (3)": {
-      "section sizes": {
-          "pc.ram": "0x0000000008000000",
-  ...
-
-See also ``analyze-migration.py -h`` help for more options.
-
 Common infrastructure
 =====================
 
@@ -594,921 +573,3 @@ path.
Return path - opened by main thread, written by main thread AND postcopy thread (protected by rp_mutex) -Dirty limit -===================== -The dirty limit, short for dirty page rate upper limit, is a new capability -introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM -dirty ring to throttle down the guest during live migration. - -The algorithm framework is as follows: - -:: - - ------------------------------------------------------------------------------ - main --------------> throttle thread ------------> PREPARE(1) <-------- - thread \ | | - \ | | - \ V | - -\ CALCULATE(2) | - \ | | - \ | | - \ V | - \ SET PENALTY(3) ----- - -\ | - \ | - \ V - -> virtual CPU thread -------> ACCEPT PENALTY(4) - ------------------------------------------------------------------------------ - -When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, -the QEMU main thread starts the throttle thread. The throttle thread, once -launched, executes the loop, which consists of three steps: - - - PREPARE (1) - - The entire work of PREPARE (1) is preparation for the second stage, - CALCULATE(2), as the name implies. It involves preparing the dirty - page rate value and the corresponding upper limit of the VM: - The dirty page rate is calculated via the KVM dirty ring mechanism, - which tells QEMU how many dirty pages a virtual CPU has had since the - last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper - limit is specified by caller, therefore fetch it directly. - - - CALCULATE (2) - - Calculate a suitable sleep period for each virtual CPU, which will be - used to determine the penalty for the target virtual CPU. The - computation must be done carefully in order to reduce the dirty page - rate progressively down to the upper limit without oscillation. To - achieve this, two strategies are provided: the first is to add or - subtract sleep time based on the ratio of the current dirty page rate - to the limit, which is used when the current dirty page rate is far - from the limit; the second is to add or subtract a fixed time when - the current dirty page rate is close to the limit. - - - SET PENALTY (3) - - Set the sleep time for each virtual CPU that should be penalized based - on the results of the calculation supplied by step CALCULATE (2). - -After completing the three above stages, the throttle thread loops back -to step PREPARE (1) until the dirty limit is reached. - -On the other hand, each virtual CPU thread reads the sleep duration and -sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that -is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will -obviously exit to the path and get penalized, whereas virtual CPUs involved -with read processes will not. - -In summary, thanks to the KVM dirty ring technology, the dirty limit -algorithm will restrict virtual CPUs as needed to keep their dirty page -rate inside the limit. This leads to more steady reading performance during -live migration and can aid in improving large guest responsiveness. - -Postcopy -======== - -'Postcopy' migration is a way to deal with migrations that refuse to converge -(or take too long to converge) its plus side is that there is an upper bound on -the amount of migration traffic and time it takes, the down side is that during -the postcopy phase, a failure of *either* side causes the guest to be lost. 
- -In postcopy the destination CPUs are started before all the memory has been -transferred, and accesses to pages that are yet to be transferred cause -a fault that's translated by QEMU into a request to the source QEMU. - -Postcopy can be combined with precopy (i.e. normal migration) so that if precopy -doesn't finish in a given time the switch is made to postcopy. - -Enabling postcopy ------------------ - -To enable postcopy, issue this command on the monitor (both source and -destination) prior to the start of migration: - -``migrate_set_capability postcopy-ram on`` - -The normal commands are then used to start a migration, which is still -started in precopy mode. Issuing: - -``migrate_start_postcopy`` - -will now cause the transition from precopy to postcopy. -It can be issued immediately after migration is started or any -time later on. Issuing it after the end of a migration is harmless. - -Blocktime is a postcopy live migration metric, intended to show how -long the vCPU was in state of interruptible sleep due to pagefault. -That metric is calculated both for all vCPUs as overlapped value, and -separately for each vCPU. These values are calculated on destination -side. To enable postcopy blocktime calculation, enter following -command on destination monitor: - -``migrate_set_capability postcopy-blocktime on`` - -Postcopy blocktime can be retrieved by query-migrate qmp command. -postcopy-blocktime value of qmp command will show overlapped blocking -time for all vCPU, postcopy-vcpu-blocktime will show list of blocking -time per vCPU. - -.. note:: - During the postcopy phase, the bandwidth limits set using - ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that - the destination is waiting for). - -Postcopy device transfer ------------------------- - -Loading of device data may cause the device emulation to access guest RAM -that may trigger faults that have to be resolved by the source, as such -the migration stream has to be able to respond with page data *during* the -device load, and hence the device data has to be read from the stream completely -before the device load begins to free the stream up. This is achieved by -'packaging' the device data into a blob that's read in one go. - -Source behaviour ----------------- - -Until postcopy is entered the migration stream is identical to normal -precopy, except for the addition of a 'postcopy advise' command at -the beginning, to tell the destination that postcopy might happen. -When postcopy starts the source sends the page discard data and then -forms the 'package' containing: - - - Command: 'postcopy listen' - - The device state - - A series of sections, identical to the precopy streams device state stream - containing everything except postcopiable devices (i.e. RAM) - - Command: 'postcopy run' - -The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the -contents are formatted in the same way as the main migration stream. - -During postcopy the source scans the list of dirty pages and sends them -to the destination without being requested (in much the same way as precopy), -however when a page request is received from the destination, the dirty page -scanning restarts from the requested location. This causes requested pages -to be sent quickly, and also causes pages directly after the requested page -to be sent quickly in the hope that those pages are likely to be used -by the destination soon. 
- -Destination behaviour ---------------------- - -Initially the destination looks the same as precopy, with a single thread -reading the migration stream; the 'postcopy advise' and 'discard' commands -are processed to change the way RAM is managed, but don't affect the stream -processing. - -:: - - ------------------------------------------------------------------------------ - 1 2 3 4 5 6 7 - main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) - thread | | - | (page request) - | \___ - v \ - listen thread: --- page -- page -- page -- page -- page -- - - a b c - ------------------------------------------------------------------------------ - -- On receipt of ``CMD_PACKAGED`` (1) - - All the data associated with the package - the ( ... ) section in the diagram - - is read into memory, and the main thread recurses into qemu_loadvm_state_main - to process the contents of the package (2) which contains commands (3,6) and - devices (4...) - -- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) - - a new thread (a) is started that takes over servicing the migration stream, - while the main thread carries on loading the package. It loads normal - background page data (b) but if during a device load a fault happens (5) - the returned page (c) is loaded by the listen thread allowing the main - threads device load to carry on. - -- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) - - letting the destination CPUs start running. At the end of the - ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and - is no longer used by migration, while the listen thread carries on servicing - page data until the end of migration. - -Postcopy Recovery ------------------ - -Comparing to precopy, postcopy is special on error handlings. When any -error happens (in this case, mostly network errors), QEMU cannot easily -fail a migration because VM data resides in both source and destination -QEMU instances. On the other hand, when issue happens QEMU on both sides -will go into a paused state. It'll need a recovery phase to continue a -paused postcopy migration. - -The recovery phase normally contains a few steps: - - - When network issue occurs, both QEMU will go into PAUSED state - - - When the network is recovered (or a new network is provided), the admin - can setup the new channel for migration using QMP command - 'migrate-recover' on destination node, preparing for a resume. - - - On source host, the admin can continue the interrupted postcopy - migration using QMP command 'migrate' with resume=true flag set. - - - After the connection is re-established, QEMU will continue the postcopy - migration on both sides. - -During a paused postcopy migration, the VM can logically still continue -running, and it will not be impacted from any page access to pages that -were already migrated to destination VM before the interruption happens. -However, if any of the missing pages got accessed on destination VM, the VM -thread will be halted waiting for the page to be migrated, it means it can -be halted until the recovery is complete. - -The impact of accessing missing pages can be relevant to different -configurations of the guest. For example, when with async page fault -enabled, logically the guest can proactively schedule out the threads -accessing missing pages. 
- -Postcopy states ---------------- - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - - Advise - - Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - - Discard - - Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - - Listen - - The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - - Running - - POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - - Paused - - Postcopy can run into a paused state (normally on both sides when - happens), where all threads will be temporarily halted mostly due to - network errors. When reaching paused state, migration will make sure - the qemu binary on both sides maintain the data without corrupting - the VM. To continue the migration, the admin needs to fix the - migration channel using the QMP command 'migrate-recover' on the - destination node, then resume the migration using QMP command 'migrate' - again on source node, with resume=true flag set. - - - End - - The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. - -Source side page map --------------------- - -The 'migration bitmap' in postcopy is basically the same as in the precopy, -where each of the bit to indicate that page is 'dirty' - i.e. needs -sending. During the precopy phase this is updated as the CPU dirties -pages, however during postcopy the CPUs are stopped and nothing should -dirty anything any more. Instead, dirty bits are cleared when the relevant -pages are sent during postcopy. - -Postcopy with hugepages ------------------------ - -Postcopy now works with hugetlbfs backed memory: - - a) The linux kernel on the destination must support userfault on hugepages. - b) The huge-page configuration on the source and destination VMs must be - identical; i.e. RAMBlocks on both sides must use the same page size. - c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal - RAM if it doesn't have enough hugepages, triggering (b) to fail. - Using ``-mem-prealloc`` enforces the allocation using hugepages. 
- d) Care should be taken with the size of hugepage used; postcopy with 2MB - hugepages works well, however 1GB hugepages are likely to be problematic - since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, - and until the full page is transferred the destination thread is blocked. - -Postcopy with shared memory ---------------------------- - -Postcopy migration with shared memory needs explicit support from the other -processes that share memory and from QEMU. There are restrictions on the type of -memory that userfault can support shared. - -The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` -(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` -for hugetlbfs which may be a problem in some configurations). - -The vhost-user code in QEMU supports clients that have Postcopy support, -and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes -to support postcopy. - -The client needs to open a userfaultfd and register the areas -of memory that it maps with userfault. The client must then pass the -userfaultfd back to QEMU together with a mapping table that allows -fault addresses in the clients address space to be converted back to -RAMBlock/offsets. The client's userfaultfd is added to the postcopy -fault-thread and page requests are made on behalf of the client by QEMU. -QEMU performs 'wake' operations on the client's userfaultfd to allow it -to continue after a page has arrived. - -.. note:: - There are two future improvements that would be nice: - a) Some way to make QEMU ignorant of the addresses in the clients - address space - b) Avoiding the need for QEMU to perform ufd-wake calls after the - pages have arrived - -Retro-fitting postcopy to existing clients is possible: - a) A mechanism is needed for the registration with userfault as above, - and the registration needs to be coordinated with the phases of - postcopy. In vhost-user extra messages are added to the existing - control channel. - b) Any thread that can block due to guest memory accesses must be - identified and the implication understood; for example if the - guest memory access is made while holding a lock then all other - threads waiting for that lock will also be blocked. - -Postcopy Preemption Mode ------------------------- - -Postcopy preempt is a new capability introduced in 8.0 QEMU release, it -allows urgent pages (those got page fault requested from destination QEMU -explicitly) to be sent in a separate preempt channel, rather than queued in -the background migration channel. Anyone who cares about latencies of page -faults during a postcopy migration should enable this feature. By default, -it's not enabled. - -Firmware -======== - -Migration migrates the copies of RAM and ROM, and thus when running -on the destination it includes the firmware from the source. Even after -resetting a VM, the old firmware is used. Only once QEMU has been restarted -is the new firmware in use. - -- Changes in firmware size can cause changes in the required RAMBlock size - to hold the firmware and thus migration can fail. In practice it's best - to pad firmware images to convenient powers of 2 with plenty of space - for growth. - -- Care should be taken with device emulation code so that newer - emulation code can work with older firmware to allow forward migration. - -- Care should be taken with newer firmware so that backward migration - to older systems with older device emulation code will work. 
- -In some cases it may be best to tie specific firmware versions to specific -versioned machine types to cut down on the combinations that will need -support. This is also useful when newer versions of firmware outgrow -the padding. - - -Backwards compatibility -======================= - -How backwards compatibility works ---------------------------------- - -When we do migration, we have two QEMU processes: the source and the -target. There are two cases, they are the same version or they are -different versions. The easy case is when they are the same version. -The difficult one is when they are different versions. - -There are two things that are different, but they have very similar -names and sometimes get confused: - -- QEMU version -- machine type version - -Let's start with a practical example, we start with: - -- qemu-system-x86_64 (v5.2), from now on qemu-5.2. -- qemu-system-x86_64 (v5.1), from now on qemu-5.1. - -Related to this are the "latest" machine types defined on each of -them: - -- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 -- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 - -First of all, migration is only supposed to work if you use the same -machine type in both source and destination. The QEMU hardware -configuration needs to be the same also on source and destination. -Most aspects of the backend configuration can be changed at will, -except for a few cases where the backend features influence frontend -device feature exposure. But that is not relevant for this section. - -I am going to list the number of combinations that we can have. Let's -start with the trivial ones, QEMU is the same on source and -destination: - -1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 - - This is the latest QEMU with the latest machine type. - This have to work, and if it doesn't work it is a bug. - -2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - Exactly the same case than the previous one, but for 5.1. - Nothing to see here either. - -This are the easiest ones, we will not talk more about them in this -section. - -Now we start with the more interesting cases. Consider the case where -we have the same QEMU version in both sides (qemu-5.2) but we are using -the latest machine type for that version (pc-5.2) but one of an older -QEMU version, in this case pc-5.1. - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - It needs to use the definition of pc-5.1 and the devices as they - were configured on 5.1, but this should be easy in the sense that - both sides are the same QEMU and both sides have exactly the same - idea of what the pc-5.1 machine is. - -4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 - - This combination is not possible as the qemu-5.1 doesn't understand - pc-5.2 machine type. So nothing to worry here. - -Now it comes the interesting ones, when both QEMU processes are -different. Notice also that the machine type needs to be pc-5.1, -because we have the limitation than qemu-5.1 doesn't know pc-5.2. So -the possible cases are: - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - This migration is known as newer to older. We need to make sure - when we are developing 5.2 we need to take care about not to break - migration to qemu-5.1. Notice that we can't make updates to - qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is - in qemu-5.2 side to make the relevant changes. 
- -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - This migration is known as older to newer. We need to make sure - than we are able to receive migrations from qemu-5.1. The problem is - similar to the previous one. - -If qemu-5.1 and qemu-5.2 were the same, there will not be any -compatibility problems. But the reason that we create qemu-5.2 is to -get new features, devices, defaults, etc. - -If we get a device that has a new feature, or change a default value, -we have a problem when we try to migrate between different QEMU -versions. - -So we need a way to tell qemu-5.2 that when we are using machine type -pc-5.1, it needs to **not** use the feature, to be able to migrate to -real qemu-5.1. - -And the equivalent part when migrating from qemu-5.1 to qemu-5.2. -qemu-5.2 has to expect that it is not going to get data for the new -feature, because qemu-5.1 doesn't know about it. - -How do we tell QEMU about these device feature changes? In -hw/core/machine.c:hw_compat_X_Y arrays. - -If we change a default value, we need to put back the old value on -that array. And the device, during initialization needs to look at -that array to see what value it needs to get for that feature. And -what are we going to put in that array, the value of a property. - -To create a property for a device, we need to use one of the -DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the -macros that exist. With it, we set the default value for that -property, and that is what it is going to get in the latest released -version. But if we want a different value for a previous version, we -can change that in the hw_compat_X_Y arrays. - -hw_compat_X_Y is an array of registers that have the format: - -- name_device -- name_property -- value - -Let's see a practical example. - -In qemu-5.2 virtio-blk-device got multi queue support. This is a -change that is not backward compatible. In qemu-5.1 it has one -queue. In qemu-5.2 it has the same number of queues as the number of -cpus in the system. - -When we are doing migration, if we migrate from a device that has 4 -queues to a device that have only one queue, we don't know where to -put the extra information for the other 3 queues, and we fail -migration. - -Similar problem when we migrate from qemu-5.1 that has only one queue -to qemu-5.2, we only sent information for one queue, but destination -has 4, and we have 3 queues that are not properly initialized and -anything can happen. - -So, how can we address this problem. Easy, just convince qemu-5.2 -that when it is running pc-5.1, it needs to set the number of queues -for virtio-blk-devices to 1. - -That way we fix the cases 5 and 6. - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - qemu-5.2 -M pc-5.1 sets number of queues to be 1. - qemu-5.1 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - qemu-5.1 -M pc-5.1 sets number of queues to be 1. - qemu-5.2 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -And now the other interesting case, case 3. In this case we have: - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - Here we have the same QEMU in both sides. So it doesn't matter a - lot if we have set the number of queues to 1 or not, because - they are the same. - - WRONG! 
- - Think what happens if we do one of this double migrations: - - A -> migrates -> B -> migrates -> C - - where: - - A: qemu-5.1 -M pc-5.1 - B: qemu-5.2 -M pc-5.1 - C: qemu-5.2 -M pc-5.1 - - migration A -> B is case 6, so number of queues needs to be 1. - - migration B -> C is case 3, so we don't care. But actually we - care because we haven't started the guest in qemu-5.2, it came - migrated from qemu-5.1. So to be in the safe place, we need to - always use number of queues 1 when we are using pc-5.1. - -Now, how was this done in reality? The following commit shows how it -was done:: - - commit 9445e1e15e66c19e42bea942ba810db28052cd05 - Author: Stefan Hajnoczi - Date: Tue Aug 18 15:33:47 2020 +0100 - - virtio-blk-pci: default num_queues to -smp N - -The relevant parts for migration are:: - - @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { - #endif - DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, - true), - - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), - + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, - + VIRTIO_BLK_AUTO_NUM_QUEUES), - DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), - -It changes the default value of num_queues. But it fishes it for old -machine types to have the right value:: - - @@ -31,6 +31,7 @@ - GlobalProperty hw_compat_5_1[] = { - ... - + { "virtio-blk-device", "num-queues", "1"}, - ... - }; - -A device with different features on both sides ----------------------------------------------- - -Let's assume that we are using the same QEMU binary on both sides, -just to make the things easier. But we have a device that has -different features on both sides of the migration. That can be -because the devices are different, because the kernel driver of both -devices have different features, whatever. - -How can we get this to work with migration. The way to do that is -"theoretically" easy. You have to get the features that the device -has in the source of the migration. The features that the device has -on the target of the migration, you get the intersection of the -features of both sides, and that is the way that you should launch -QEMU. - -Notice that this is not completely related to QEMU. The most -important thing here is that this should be handled by the managing -application that launches QEMU. If QEMU is configured correctly, the -migration will succeed. - -That said, actually doing it is complicated. Almost all devices are -bad at being able to be launched with only some features enabled. -With one big exception: cpus. - -You can read the documentation for QEMU x86 cpu models here: - -https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html - -See when they talk about migration they recommend that one chooses the -newest cpu model that is supported for all cpus. - -Let's say that we have: - -Host A: - -Device X has the feature Y - -Host B: - -Device X has not the feature Y - -If we try to migrate without any care from host A to host B, it will -fail because when migration tries to load the feature Y on -destination, it will find that the hardware is not there. - -Doing this would be the equivalent of doing with cpus: - -Host A: - -$ qemu-system-x86_64 -cpu host - -Host B: - -$ qemu-system-x86_64 -cpu host - -When both hosts have different cpu features this is guaranteed to -fail. Especially if Host B has less features than host A. If host A -has less features than host B, sometimes it works. Important word of -last sentence is "sometimes". 
- -So, forgetting about cpu models and continuing with the -cpu host -example, let's see that the differences of the cpus is that Host A and -B have the following features: - -Features: 'pcid' 'stibp' 'taa-no' -Host A: X X -Host B: X - -And we want to migrate between them, the way configure both QEMU cpu -will be: - -Host A: - -$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off - -Host B: - -$ qemu-system-x86_64 -cpu host,taa-no=off - -And you would be able to migrate between them. It is responsibility -of the management application or of the user to make sure that the -configuration is correct. QEMU doesn't know how to look at this kind -of features in general. - -Notice that we don't recommend to use -cpu host for migration. It is -used in this example because it makes the example simpler. - -Other devices have worse control about individual features. If they -want to be able to migrate between hosts that show different features, -the device needs a way to configure which ones it is going to use. - -In this section we have considered that we are using the same QEMU -binary in both sides of the migration. If we use different QEMU -versions process, then we need to have into account all other -differences and the examples become even more complicated. - -How to mitigate when we have a backward compatibility error ------------------------------------------------------------ - -We broke migration for old machine types continuously during -development. But as soon as we find that there is a problem, we fix -it. The problem is what happens when we detect after we have done a -release that something has gone wrong. - -Let see how it worked with one example. - -After the release of qemu-8.0 we found a problem when doing migration -of the machine type pc-7.2. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration fails - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration fails - -So clearly something fails when migration between qemu-7.2 and -qemu-8.0 with machine type pc-7.2. The error messages, and git bisect -pointed to this commit. - -In qemu-8.0 we got this commit:: - - commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 - Author: Jonathan Cameron - Date: Thu Mar 2 13:37:02 2023 +0000 - - hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register - - -The relevant bits of the commit for our example are this ones:: - - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -The patch changes how we configure PCI space for AER. But QEMU fails -when the PCI space configuration is different between source and -destination. - -The following commit shows how this got fixed:: - - commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f - Author: Leonardo Bras - Date: Tue May 2 21:27:02 2023 -0300 - - hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 - - [...] 
- -The relevant parts of the fix in QEMU are as follow: - -First, we create a new property for the device to be able to configure -the old behaviour or the new behaviour:: - - diff --git a/hw/pci/pci.c b/hw/pci/pci.c - index 8a87ccc8b0..5153ad63d6 100644 - --- a/hw/pci/pci.c - +++ b/hw/pci/pci.c - @@ -79,6 +79,8 @@ static Property pci_props[] = { - DEFINE_PROP_STRING("failover_pair_id", PCIDevice, - failover_pair_id), - DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), - + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, - + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), - DEFINE_PROP_END_OF_LIST() - }; - -Notice that we enable the feature for new machine types. - -Now we see how the fix is done. This is going to depend on what kind -of breakage happens, but in this case it is quite simple:: - - diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c - index 103667c368..374d593ead 100644 - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, - uint16_t offset, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_MASK_DEFAULT); - - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_SUPPORTED); - + - + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - + } - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -I.e. If the property bit is enabled, we configure it as we did for -qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. - -And now, everything that is missing is disabling the feature for old -machine types:: - - diff --git a/hw/core/machine.c b/hw/core/machine.c - index 47a34841a5..07f763eb2e 100644 - --- a/hw/core/machine.c - +++ b/hw/core/machine.c - @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { - { "e1000e", "migrate-timadj", "off" }, - { "virtio-mem", "x-early-migration", "false" }, - { "migration", "x-preempt-pre-7-2", "true" }, - + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, - }; - const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); - -And now, when qemu-8.0.1 is released with this fix, all combinations -are going to work as supposed. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) - -So the normality has been restored and everything is ok, no? - -Not really, now our matrix is much bigger. We started with the easy -cases, migration from the same version to the same version always -works: - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 - -Now the interesting ones. When the QEMU processes versions are -different. For the 1st set, their fail and we can do nothing, both -versions are released and we can't change anything. - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - -This two are the ones that work. 
The whole point of making the
-change in qemu-8.0.1 release was to fix this issue:
-
-- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
-- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2
-
-But now we found that qemu-8.0 neither can migrate to qemu-7.2 not
-qemu-8.0.1.
-
-- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
-- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2
-
-So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to
-anything except to qemu-8.0.
-
-Can we do better?
-
-Yeap. If we know that we are going to do this migration:
-
-- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2
-
-We can launch the appropriate devices with::
-
-    --device...,x-pci-e-err-unc-mask=on
-
-And now we can receive a migration from 8.0. And from now on, we can
-do that migration to new machine types if we remember to enable that
-property for pc-7.2. Notice that we need to remember, it is not
-enough to know that the source of the migration is qemu-8.0. Think of
-this example:
-
-$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2
-
-In the second migration, the source is not qemu-8.0, but we still have
-that "problem" and have that property enabled. Notice that we need to
-continue having this mark/property until we have this machine
-rebooted. But it is not a normal reboot (that don't reload QEMU) we
-need the machine to poweroff/poweron on a fixed QEMU. And from now
-on we can use the proper real machine.
diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6c51e96d798418ecb048a71bba8b0847d971bcc7
--- /dev/null
+++ b/docs/devel/migration/postcopy.rst
@@ -0,0 +1,313 @@
+========
+Postcopy
+========
+
+.. contents::
+
+'Postcopy' migration is a way to deal with migrations that refuse to
+converge (or take too long to converge). Its plus side is that there
+is an upper bound on the amount of migration traffic and the time it
+takes; the downside is that during the postcopy phase, a failure of
+*either* side causes the guest to be lost.
+
+In postcopy the destination CPUs are started before all the memory has
+been transferred, and accesses to pages that are yet to be transferred
+cause a fault that's translated by QEMU into a request to the source
+QEMU.
+
+Postcopy can be combined with precopy (i.e. normal migration) so that
+if precopy doesn't finish in a given time the switch is made to
+postcopy.
+
+Enabling postcopy
+=================
+
+To enable postcopy, issue this command on the monitor (both source and
+destination) prior to the start of migration:
+
+``migrate_set_capability postcopy-ram on``
+
+The normal commands are then used to start a migration, which is still
+started in precopy mode. Issuing:
+
+``migrate_start_postcopy``
+
+will now cause the transition from precopy to postcopy.
+It can be issued immediately after migration is started or any
+time later on. Issuing it after the end of a migration is harmless.
+
+Blocktime is a postcopy live migration metric, intended to show how
+long a vCPU was in a state of interruptible sleep due to a page fault.
+That metric is calculated both for all vCPUs as an overlapped value,
+and separately for each vCPU. These values are calculated on the
+destination side. To enable postcopy blocktime calculation, enter the
+following command on the destination monitor:
+
+``migrate_set_capability postcopy-blocktime on``
+
+Postcopy blocktime can be retrieved with the query-migrate QMP
+command: the postcopy-blocktime value will show the overlapped
+blocking time for all vCPUs, and postcopy-vcpu-blocktime will show the
+list of blocking times per vCPU.
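+For example (a hedged sketch via scripts/qmp/qmp-shell; the numbers
+are made up), the fields appear in the query-migrate output roughly
+like this:
+
+.. code-block:: shell
+
+    (QEMU) query-migrate
+    {..., "postcopy-blocktime": 3134,
+          "postcopy-vcpu-blocktime": [3131, 3140], ...}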
+The ``postcopy-blocktime`` value in the reply is the overlapped blocking
+time across all vCPUs, while ``postcopy-vcpu-blocktime`` is the list of
+blocking times per vCPU.
+
+.. note::
+  During the postcopy phase, the bandwidth limits set using
+  ``migrate_set_parameter`` are ignored (to avoid delaying requested pages that
+  the destination is waiting for).
+
+Postcopy internals
+==================
+
+State machine
+-------------
+
+Postcopy moves through a series of states (see postcopy_state) from
+ADVISE->DISCARD->LISTEN->RUNNING->END
+
+ - Advise
+
+   Set at the start of migration if postcopy is enabled, even
+   if it hasn't had the start command; here the destination
+   checks that its OS has the support needed for postcopy, and performs
+   setup to ensure the RAM mappings are suitable for later postcopy.
+   The destination will fail early in migration at this point if the
+   required OS support is not present.
+   (Triggered by reception of POSTCOPY_ADVISE command)
+
+ - Discard
+
+   Entered on receipt of the first 'discard' command; prior to
+   the first Discard being performed, hugepages are switched off
+   (using madvise) to ensure that no new huge pages are created
+   during the postcopy phase, and to cause any huge pages that
+   have discards on them to be broken.
+
+ - Listen
+
+   The first command in the package, POSTCOPY_LISTEN, switches
+   the destination state to Listen, and starts a new thread
+   (the 'listen thread') which takes over the job of receiving
+   pages off the migration stream, while the main thread carries
+   on processing the blob.  With this thread able to process page
+   reception, the destination now 'sensitises' the RAM to detect
+   any access to missing pages (on Linux using the 'userfault'
+   system).
+
+ - Running
+
+   POSTCOPY_RUN causes the destination to synchronise all
+   state and start the CPUs and IO devices running.  The main
+   thread now finishes processing the migration package and
+   carries on as it would for normal precopy migration
+   (although it can't do the cleanup it would do as it
+   finishes a normal migration).
+
+ - Paused
+
+   Postcopy can run into a paused state (normally on both sides at the
+   same time when it happens), where all threads are temporarily halted,
+   mostly due to network errors.  When the paused state is reached,
+   migration makes sure the QEMU binaries on both sides keep the data
+   intact, without corrupting the VM.  To continue the migration, the
+   admin needs to fix the migration channel using the QMP command
+   'migrate-recover' on the destination node, then resume the migration
+   using the QMP command 'migrate' again on the source node, with the
+   resume=true flag set.
+
+ - End
+
+   The listen thread can now quit, and perform the cleanup of migration
+   state; the migration is now complete.
+
+Device transfer
+---------------
+
+Loading of device data may cause the device emulation to access guest RAM,
+which may trigger faults that have to be resolved by the source.  As such,
+the migration stream has to be able to respond with page data *during* the
+device load; hence the device data has to be read from the stream completely
+before the device load begins, to free the stream up.  This is achieved by
+'packaging' the device data into a blob that's read in one go.
+
+Source behaviour
+----------------
+
+Until postcopy is entered the migration stream is identical to normal
+precopy, except for the addition of a 'postcopy advise' command at
+the beginning, to tell the destination that postcopy might happen.
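+
+The switch itself is driven from the management layer; for reference, a
+sketch of the QMP equivalents of the HMP commands shown earlier (the
+destination URI is illustrative):
+
+.. code-block:: none
+
+   -> { "execute": "migrate-set-capabilities", "arguments":
+        { "capabilities": [ { "capability": "postcopy-ram", "state": true } ] } }
+   <- { "return": {} }
+   -> { "execute": "migrate", "arguments": { "uri": "tcp:destination.host:4444" } }
+   <- { "return": {} }
+   -> { "execute": "migrate-start-postcopy" }
+   <- { "return": {} }
+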
+When postcopy starts, the source sends the page discard data and then
+forms the 'package' containing:
+
+   - Command: 'postcopy listen'
+   - The device state
+
+     A series of sections, identical to the precopy stream's device state
+     stream, containing everything except postcopiable devices (i.e. RAM)
+   - Command: 'postcopy run'
+
+The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the
+contents are formatted in the same way as the main migration stream.
+
+During postcopy the source scans the list of dirty pages and sends them
+to the destination without being requested (in much the same way as precopy);
+however, when a page request is received from the destination, the dirty page
+scanning restarts from the requested location.  This causes requested pages
+to be sent quickly, and also causes pages directly after the requested page
+to be sent quickly, in the hope that those pages are likely to be used
+by the destination soon.
+
+Destination behaviour
+---------------------
+
+Initially the destination looks the same as precopy, with a single thread
+reading the migration stream; the 'postcopy advise' and 'discard' commands
+are processed to change the way RAM is managed, but don't affect the stream
+processing.
+
+::
+
+  ------------------------------------------------------------------------------
+                          1      2   3     4 5                      6   7
+  main -----DISCARD-CMD_PACKAGED ( LISTEN  DEVICE     DEVICE DEVICE RUN )
+  thread                             |       |
+                                     |     (page request)
+                                     |        \___
+                                     v            \
+  listen thread:                     --- page -- page -- page -- page -- page --
+
+                                     a   b        c
+  ------------------------------------------------------------------------------
+
+- On receipt of ``CMD_PACKAGED`` (1)
+
+  All the data associated with the package - the ( ... ) section in the
+  diagram - is read into memory, and the main thread recurses into
+  qemu_loadvm_state_main to process the contents of the package (2),
+  which contains commands (3,6) and devices (4...)
+
+- On receipt of 'postcopy listen' - 3 - (i.e. the 1st command in the package)
+
+  a new thread (a) is started that takes over servicing the migration stream,
+  while the main thread carries on loading the package.  It loads normal
+  background page data (b) but if during a device load a fault happens (5)
+  the returned page (c) is loaded by the listen thread, allowing the main
+  thread's device load to carry on.
+
+- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6)
+
+  letting the destination CPUs start running.  At the end of the
+  ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and
+  is no longer used by migration, while the listen thread carries on servicing
+  page data until the end of migration.
+
+Source side page bitmap
+-----------------------
+
+The 'migration bitmap' in postcopy is basically the same as in precopy:
+each bit indicates that a page is 'dirty' - i.e. needs sending.  During
+the precopy phase this is updated as the CPU dirties pages; however,
+during postcopy the CPUs are stopped and nothing should dirty anything
+any more.  Instead, dirty bits are cleared when the relevant pages are
+sent during postcopy.
+
+Postcopy features
+=================
+
+Postcopy recovery
+-----------------
+
+Compared to precopy, postcopy is special in its error handling.  When any
+error happens (in this case, mostly network errors), QEMU cannot easily
+fail the migration, because VM data resides in both the source and the
+destination QEMU instances.  Instead, when an issue happens, QEMU on both
+sides will go into a paused state.
+A recovery phase is then needed to continue the paused postcopy migration.
+
+The recovery phase normally contains a few steps:
+
+  - When a network issue occurs, both QEMU instances go into the PAUSED state
+
+  - When the network is recovered (or a new network is provided), the admin
+    can set up the new channel for migration using the QMP command
+    'migrate-recover' on the destination node, preparing for a resume.
+
+  - On the source host, the admin can continue the interrupted postcopy
+    migration using the QMP command 'migrate' with the resume=true flag set.
+
+  - After the connection is re-established, QEMU will continue the postcopy
+    migration on both sides.
+
+During a paused postcopy migration, the VM can logically still continue
+running, and it will not be impacted by any access to pages that were
+already migrated to the destination VM before the interruption happened.
+However, if any of the missing pages get accessed on the destination VM,
+the VM thread will be halted waiting for the page to be migrated, which
+means it can be halted until the recovery is complete.
+
+The impact of accessing missing pages can vary with the configuration of
+the guest.  For example, with async page fault enabled, the guest can
+logically schedule out the threads accessing missing pages proactively.
+
+Postcopy with hugepages
+-----------------------
+
+Postcopy now works with hugetlbfs backed memory:
+
+  a) The Linux kernel on the destination must support userfault on hugepages.
+  b) The huge-page configuration on the source and destination VMs must be
+     identical; i.e. RAMBlocks on both sides must use the same page size.
+  c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal
+     RAM if it doesn't have enough hugepages, triggering (b) to fail.
+     Using ``-mem-prealloc`` enforces the allocation using hugepages.
+  d) Care should be taken with the size of hugepage used; postcopy with 2MB
+     hugepages works well, however 1GB hugepages are likely to be problematic
+     since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link,
+     and until the full page is transferred the destination thread is blocked.
+
+Postcopy with shared memory
+---------------------------
+
+Postcopy migration with shared memory needs explicit support from the other
+processes that share the memory and from QEMU.  There are restrictions on
+the types of memory that userfault can support when shared.
+
+The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs``
+(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)``
+for hugetlbfs, which may be a problem in some configurations).
+
+The vhost-user code in QEMU supports clients that have postcopy support,
+and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes
+to support postcopy.
+
+The client needs to open a userfaultfd and register the areas
+of memory that it maps with userfault.  The client must then pass the
+userfaultfd back to QEMU together with a mapping table that allows
+fault addresses in the client's address space to be converted back to
+RAMBlock/offsets.  The client's userfaultfd is added to the postcopy
+fault-thread, and page requests are made on behalf of the client by QEMU.
+QEMU performs 'wake' operations on the client's userfaultfd to allow it
+to continue after a page has arrived.
+
+.. note::
+  There are two future improvements that would be nice:
+    a) Some way to make QEMU ignorant of the addresses in the client's
+       address space
+    b) Avoiding the need for QEMU to perform ufd-wake calls after the
+       pages have arrived
+
+Retro-fitting postcopy to existing clients is possible:
+  a) A mechanism is needed for the registration with userfault as above,
+     and the registration needs to be coordinated with the phases of
+     postcopy.  In vhost-user extra messages are added to the existing
+     control channel.
+  b) Any thread that can block due to guest memory accesses must be
+     identified and the implication understood; for example if the
+     guest memory access is made while holding a lock then all other
+     threads waiting for that lock will also be blocked.
+
+Postcopy preemption mode
+------------------------
+
+Postcopy preempt is a capability introduced in the QEMU 8.0 release.  It
+allows urgent pages (those whose page faults were explicitly requested by
+the destination QEMU) to be sent in a separate preempt channel, rather than
+queued in the background migration channel.  Anyone who cares about
+latencies of page faults during a postcopy migration should enable this
+feature.  By default, it's not enabled.
diff --git a/docs/devel/migration/qatzip-compression.rst b/docs/devel/migration/qatzip-compression.rst
new file mode 100644
index 0000000000000000000000000000000000000000..862b3831647b3e8766c308ae1067c1baa8aa8f3e
--- /dev/null
+++ b/docs/devel/migration/qatzip-compression.rst
@@ -0,0 +1,165 @@
+==================
+QATzip Compression
+==================
+In scenarios with limited network bandwidth, the ``QATzip`` solution can help
+users save a lot of host CPU resources by accelerating compression and
+decompression through the Intel QuickAssist Technology (``QAT``) hardware.
+
+
+The following test was conducted using 8 multifd channels and 10Gbps network
+bandwidth.  The results show that, compared to zstd, ``QATzip`` significantly
+saves CPU resources on the sender and reduces migration time.  Compared to the
+uncompressed solution, ``QATzip`` greatly improves the dirty page processing
+capability, indicated by the Pages per Second metric, and also reduces the
+total migration time.
+
+::
+
+  VM Configuration: 16 vCPU and 64G memory
+  VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data.
+  QAT Devices: 4
+  |-----------|--------|---------|----------|----------|------|------|
+  |8 Channels |Total   |down     |throughput|pages per | send | recv |
+  |           |time(ms)|time(ms) |(mbps)    |second    | cpu %| cpu% |
+  |-----------|--------|---------|----------|----------|------|------|
+  |qatzip     |   16630|       28|     10467|   2940235|   160|   360|
+  |-----------|--------|---------|----------|----------|------|------|
+  |zstd       |   20165|       24|      8579|   2391465|   810|   340|
+  |-----------|--------|---------|----------|----------|------|------|
+  |none       |   46063|       40|     10848|    330240|    45|    85|
+  |-----------|--------|---------|----------|----------|------|------|
+
+
+QATzip Compression Framework
+============================
+
+``QATzip`` is a user space library which builds on top of the Intel QuickAssist
+Technology to provide extended accelerated compression and decompression
+services.
+ +For more ``QATzip`` introduction, please refer to `QATzip Introduction +`_ + +:: + + +----------------+ + | MultiFd Thread | + +-------+--------+ + | + | compress/decompress + +-------+--------+ + | QATzip library | + +-------+--------+ + | + +-------+--------+ + | QAT library | + +-------+--------+ + | user space + --------+--------------------- + | kernel space + +------+-------+ + | QAT Driver | + +------+-------+ + | + +------+-------+ + | QAT Devices | + +--------------+ + + +QATzip Installation +------------------- + +The ``QATzip`` installation package has been integrated into some Linux +distributions and can be installed directly. For example, the Ubuntu Server +24.04 LTS system can be installed using below command + +.. code-block:: shell + + #apt search qatzip + libqatzip-dev/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library development files + + libqatzip3/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library + + qatzip/noble,now 1.2.0-0ubuntu3 amd64 [installed] + Compression user-space tool for Intel QuickAssist Technology + + #sudo apt install libqatzip-dev libqatzip3 qatzip + +If your system does not support the ``QATzip`` installation package, you can +use the source code to build and install, please refer to `QATzip source code installation +`_ + +QAT Hardware Deployment +----------------------- + +``QAT`` supports physical functions(PFs) and virtual functions(VFs) for +deployment, and users can configure ``QAT`` resources for migration according +to actual needs. For more details about ``QAT`` deployment, please refer to +`Intel QuickAssist Technology Documentation +`_ + +For more ``QAT`` hardware introduction, please refer to `intel-quick-assist-technology-overview +`_ + +How To Use QATzip Compression +============================= + +1 - Install ``QATzip`` library + +2 - Build ``QEMU`` with ``--enable-qatzip`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qatzip`` + +3 - Set ``migrate_set_parameter multifd-compression qatzip`` + +4 - Set ``migrate_set_parameter multifd-qatzip-level comp_level``, the default +comp_level value is 1, and it supports levels from 1 to 9 + +QAT Memory Requirements +======================= + +The user needs to reserve system memory for the QAT memory management to +allocate DMA memory. The size of the reserved system memory depends on the +number of devices used for migration and the number of multifd channels. + +Because memory usage depends on QAT configuration, please refer to `QAT Memory +Driver Queries +`_ +for memory usage calculation. + +.. list-table:: An example of a PF used for migration + :header-rows: 1 + + * - Number of channels + - Sender memory usage + - Receiver memory usage + * - 2 + - 10M + - 10M + * - 4 + - 12M + - 14M + * - 8 + - 16M + - 20M + +How To Choose Between QATzip and QPL +==================================== +Starting from 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``), multiple built-in accelerators are supported including +``QAT`` and ``IAA``. The former can accelerate ``QATzip`` and the latter is +used to accelerate ``QPL``. + +Here are some suggestions: + +1 - If the live migration scenario is limited by network bandwidth and ``QAT`` +hardware resources exceed ``IAA``, use the ``QATzip`` method, which can save a +lot of host CPU resources for compression. 
+ +2 - If the system cannot support shared virtual memory (SVM) technology, use +the ``QATzip`` method because ``QPL`` performance is not good without SVM +support. + +3 - For other scenarios, use the ``QPL`` method first. diff --git a/docs/devel/migration/qpl-compression.rst b/docs/devel/migration/qpl-compression.rst new file mode 100644 index 0000000000000000000000000000000000000000..990992d78655b1d1ce7bb07f5fc46cc8c6b3dd81 --- /dev/null +++ b/docs/devel/migration/qpl-compression.rst @@ -0,0 +1,260 @@ +=============== +QPL Compression +=============== +The Intel Query Processing Library (Intel ``QPL``) is an open-source library to +provide compression and decompression features and it is based on deflate +compression algorithm (RFC 1951). + +The ``QPL`` compression relies on Intel In-Memory Analytics Accelerator(``IAA``) +and Shared Virtual Memory(``SVM``) technology, they are new features supported +from Intel 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``). + +For more ``QPL`` introduction, please refer to `QPL Introduction +`_ + +QPL Compression Framework +========================= + +:: + + +----------------+ +------------------+ + | MultiFD Thread | |accel-config tool | + +-------+--------+ +--------+---------+ + | | + | | + |compress/decompress | + +-------+--------+ | Setup IAA + | QPL library | | Resources + +-------+---+----+ | + | | | + | +-------------+-------+ + | Open IAA | + | Devices +-----+-----+ + | |idxd driver| + | +-----+-----+ + | | + | | + | +-----+-----+ + +-----------+IAA Devices| + Submit jobs +-----------+ + via enqcmd + + +QPL Build And Installation +-------------------------- + +.. code-block:: shell + + $git clone --recursive https://github.com/intel/qpl.git qpl + $mkdir qpl/build + $cd qpl/build + $cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DQPL_LIBRARY_TYPE=SHARED .. + $sudo cmake --build . --target install + +For more details about ``QPL`` installation, please refer to `QPL Installation +`_ + +IAA Device Management +--------------------- + +The number of ``IAA`` devices will vary depending on the Xeon product model. +On a ``SPR`` server, there can be a maximum of 8 ``IAA`` devices, with up to +4 devices per socket. + +By default, all ``IAA`` devices are disabled and need to be configured and +enabled by users manually. + +Check the number of devices through the following command + +.. code-block:: shell + + #lspci -d 8086:0cfe + 6a:02.0 System peripheral: Intel Corporation Device 0cfe + 6f:02.0 System peripheral: Intel Corporation Device 0cfe + 74:02.0 System peripheral: Intel Corporation Device 0cfe + 79:02.0 System peripheral: Intel Corporation Device 0cfe + e7:02.0 System peripheral: Intel Corporation Device 0cfe + ec:02.0 System peripheral: Intel Corporation Device 0cfe + f1:02.0 System peripheral: Intel Corporation Device 0cfe + f6:02.0 System peripheral: Intel Corporation Device 0cfe + +IAA Device Configuration And Enabling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``accel-config`` tool is used to enable ``IAA`` devices and configure +``IAA`` hardware resources(work queues and engines). One ``IAA`` device +has 8 work queues and 8 processing engines, multiple engines can be assigned +to a work queue via ``group`` attribute. + +For ``accel-config`` installation, please refer to `accel-config installation +`_ + +One example of configuring and enabling an ``IAA`` device. + +.. 
code-block:: shell
+
+   #accel-config config-engine iax1/engine1.0 -g 0
+   #accel-config config-engine iax1/engine1.1 -g 0
+   #accel-config config-engine iax1/engine1.2 -g 0
+   #accel-config config-engine iax1/engine1.3 -g 0
+   #accel-config config-engine iax1/engine1.4 -g 0
+   #accel-config config-engine iax1/engine1.5 -g 0
+   #accel-config config-engine iax1/engine1.6 -g 0
+   #accel-config config-engine iax1/engine1.7 -g 0
+   #accel-config config-wq iax1/wq1.0 -g 0 -s 128 -p 10 -b 1 -t 128 -m shared -y user -n app1 -d user
+   #accel-config enable-device iax1
+   #accel-config enable-wq iax1/wq1.0
+
+.. note::
+   IAX is an early name for IAA
+
+- The ``IAA`` device index is 1; use the ``ls -lh /sys/bus/dsa/devices/iax*``
+  command to query the ``IAA`` device index.
+
+- 8 engines and 1 work queue are configured in group 0, so all compression jobs
+  submitted to this work queue can be processed by all engines at the same time.
+
+- Set work queue attributes including the work mode, work queue size and so on.
+
+- Enable the ``IAA1`` device and work queue 1.0
+
+.. note::
+
+   Set the work queue mode to shared mode, since the ``QPL`` library only
+   supports shared mode.
+
+For more detailed configuration, please refer to `IAA Configuration Samples
+`_
+
+IAA Unit Test
+^^^^^^^^^^^^^
+
+- For enabling ``IAA`` devices on the Xeon platform, please refer to the
+  `IAA User Guide `_
+
+- The ``IAA`` device driver is the Intel Data Accelerator Driver (idxd); it is
+  recommended that the minimum version of the Linux kernel is 5.18.
+
+- Add the ``"intel_iommu=on,sm_on"`` parameter to the kernel command line
+  to enable the ``SVM`` feature.
+
+Here is an easy way to verify the ``IAA`` device driver and ``SVM`` with `iaa_test
+`_
+
+.. code-block:: shell
+
+   #./test/iaa_test
+   [ info] alloc wq 0 shared size 128 addr 0x7f26cebe5000 batch sz 0xfffffffe xfer sz 0x80000000
+   [ info] test noop: tflags 0x1 num_desc 1
+   [ info] preparing descriptor for noop
+   [ info] Submitted all noop jobs
+   [ info] verifying task result for 0x16f7e20
+   [ info] test with op 0 passed
+
+
+IAA Resources Allocation For Migration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are no ``IAA`` resource configuration parameters for migration, and
+the ``accel-config`` tool configuration cannot directly specify the ``IAA``
+resources used for migration.
+
+The multifd migration with the ``QPL`` compression method will use all work
+queues that are enabled and in shared mode.
+
+.. note::
+
+   Accessing IAA resources requires the ``sudo`` command or ``root`` privileges
+   by default.  Administrators can modify the IAA device node ownership
+   so that QEMU can use IAA with specified user permissions.
+
+   For example
+
+   #chown -R qemu /dev/iax
+
+Shared Virtual Memory (SVM) Introduction
+========================================
+
+``SVM`` is the ability for an accelerator I/O device to operate in the same
+virtual memory space as applications on host processors.  It also implies
+the ability to operate from pageable memory, avoiding functional
+requirements to pin memory for DMA operations.
+
+When using ``SVM`` technology, users do not need to reserve memory for the
+``IAA`` device or perform memory pinning operations.  The ``IAA`` device can
+directly access data using the virtual address of the process.
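+
+A quick sanity check of these prerequisites (the sysfs path below is an
+assumption based on common idxd driver layouts and may differ between
+kernel versions):
+
+.. code-block:: shell
+
+   #grep -o "intel_iommu=[^ ]*" /proc/cmdline
+   intel_iommu=on,sm_on
+
+   #cat /sys/bus/dsa/devices/wq1.0/mode
+   shared
+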
+ +For more ``SVM`` technology, please refer to +`Shared Virtual Addressing (SVA) with ENQCMD +`_ + + +How To Use QPL Compression In Migration +======================================= + +1 - Installation of ``QPL`` library and ``accel-config`` library if using IAA + +2 - Configure and enable ``IAA`` devices and work queues via ``accel-config`` + +3 - Build ``QEMU`` with ``--enable-qpl`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qpl`` + +4 - Enable ``QPL`` compression during migration + + Set ``migrate_set_parameter multifd-compression qpl`` when migrating, the + ``QPL`` compression does not support configuring the compression level, it + only supports one compression level. + +The Difference Between QPL And ZLIB +=================================== + +Although both ``QPL`` and ``ZLIB`` are based on the deflate compression +algorithm, and ``QPL`` can support the header and tail of ``ZLIB``, ``QPL`` +is still not fully compatible with the ``ZLIB`` compression in the migration. + +``QPL`` only supports 4K history buffer, and ``ZLIB`` is 32K by default. +``ZLIB`` compresses data that ``QPL`` may not decompress correctly and +vice versa. + +``QPL`` does not support the ``Z_SYNC_FLUSH`` operation in ``ZLIB`` streaming +compression, current ``ZLIB`` implementation uses ``Z_SYNC_FLUSH``, so each +``multifd`` thread has a ``ZLIB`` streaming context, and all page compression +and decompression are based on this stream. ``QPL`` cannot decompress such data +and vice versa. + +The introduction for ``Z_SYNC_FLUSH``, please refer to `Zlib Manual +`_ + +The Best Practices +================== +When user enables the IAA device for ``QPL`` compression, it is recommended +to add ``-mem-prealloc`` parameter to the destination boot parameters. This +parameter can avoid the occurrence of I/O page fault and reduce the overhead +of IAA compression and decompression. + +The example of booting with ``-mem-prealloc`` parameter + +.. code-block:: shell + + $qemu-system-x86_64 --enable-kvm -cpu host --mem-prealloc ... + + +An example about I/O page fault measurement of destination without +``-mem-prealloc``, the ``svm_prq`` indicates the number of I/O page fault +occurrences and processing time. + +.. code-block:: shell + + #echo 1 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 2 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 3 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 4 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #cat /sys/kernel/debug/iommu/intel/dmar_perf_latency + IOMMU: dmar18 Register Base Address: c87fc000 + <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms 1ms-10ms >=10ms min(us) max(us) average(us) + inv_iotlb 0 286 123 0 0 0 0 0 1 0 + inv_devtlb 0 276 133 0 0 0 0 0 2 0 + inv_iec 0 0 0 0 0 0 0 0 0 0 + svm_prq 0 0 25206 364 395 0 0 1 556 9 diff --git a/docs/devel/migration/uadk-compression.rst b/docs/devel/migration/uadk-compression.rst new file mode 100644 index 0000000000000000000000000000000000000000..3f73345dd531826b8246c563ee8a860a8cda61de --- /dev/null +++ b/docs/devel/migration/uadk-compression.rst @@ -0,0 +1,144 @@ +========================================================= +User Space Accelerator Development Kit (UADK) Compression +========================================================= +UADK is a general-purpose user space accelerator framework that uses shared +virtual addressing (SVA) to provide a unified programming interface for +hardware acceleration of cryptographic and compression algorithms. 
+
+UADK includes the Unified/User-space-access-intended Accelerator Framework
+(UACCE), which enables hardware accelerators from different vendors that
+support SVA to adapt to UADK.
+
+Currently, HiSilicon Kunpeng hardware accelerators have been registered with
+UACCE.  Through the UADK framework, users can run cryptographic and compression
+algorithms using hardware accelerators instead of CPUs, freeing up CPU
+computing power and improving computing performance.
+
+https://github.com/Linaro/uadk/tree/master/docs
+
+UADK Framework
+==============
+UADK consists of UACCE, vendors' drivers, and an algorithm layer.  UADK requires
+the hardware accelerator to support SVA, and the operating system to support
+IOMMU and SVA.  Hardware accelerators from different vendors are registered as
+different character devices with UACCE by using kernel-mode drivers of the
+vendors.  A user can access the hardware accelerators by performing user-mode
+operations on the character devices.
+
+::
+
+          +----------------------------------+
+          |                apps              |
+          +----+------------------------+----+
+               |                        |
+               |                        |
+       +-------+--------+       +-------+-------+
+       |   scheduler    |       | alg libraries |
+       +-------+--------+       +-------+-------+
+               |                        |
+               |                        |
+               |                        |
+               |                +-------+-------+
+               |                | vendor drivers|
+               |                +-+-------------+
+               |                  |
+               |                  |
+            +--+------------------+--+
+            |         libwd          |
+    User    +----+-------------+-----+
+    --------------------------------------------------
+    Kernel      +--+-----+   +------+
+                | uacce  |   | smmu |
+                +---+----+   +------+
+                    |
+                +---+------------------+
+                | vendor kernel driver |
+                +----------------------+
+    --------------------------------------------------
+                 +----------------------+
+                 |   HW Accelerators    |
+                 +----------------------+
+
+UADK Installation
+-----------------
+Build UADK
+^^^^^^^^^^
+
+.. code-block:: shell
+
+   git clone https://github.com/Linaro/uadk.git
+   cd uadk
+   mkdir build
+   ./autogen.sh
+   ./configure --prefix=$PWD/build
+   make
+   make install
+
+Without --prefix, UADK will be installed to /usr/local/lib by default.
+If you get the error "cannot find -lnuma", please install the libnuma-dev
+package.
+
+Run pkg-config libwd to ensure the environment is set up correctly
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* export PKG_CONFIG_PATH=$PWD/build/lib/pkgconfig
+* pkg-config libwd --cflags --libs
+  -I/usr/local/include -L/usr/local/lib -lwd
+
+* export PKG_CONFIG_PATH is required on demand.
+  Not required if UADK is installed to /usr/local/lib
+
+UADK Host Kernel Requirements
+-----------------------------
+Users need to make sure that ``UACCE`` is already supported in the Linux
+kernel.  The kernel version should be at least v5.9 with SVA (Shared Virtual
+Addressing) enabled.
+
+Kernel Configuration
+^^^^^^^^^^^^^^^^^^^^
+
+``UACCE`` could be built as a module or built-in.
+
+Here's an example of enabling UACCE with a hardware accelerator on the
+HiSilicon Kunpeng platform.
+
+* CONFIG_IOMMU_SVA_LIB=y
+* CONFIG_ARM_SMMU=y
+* CONFIG_ARM_SMMU_V3=y
+* CONFIG_ARM_SMMU_V3_SVA=y
+* CONFIG_PCI_PASID=y
+* CONFIG_UACCE=y
+* CONFIG_CRYPTO_DEV_HISI_QM=y
+* CONFIG_CRYPTO_DEV_HISI_ZIP=y
+
+Make sure all of the above kernel configurations are selected.
+
+Accelerator dev node permissions
+--------------------------------
+Hardware accelerators (e.g. the HiSilicon Kunpeng Zip accelerator) get
+registered to UADK, and char devices are created in the dev directory.  In
+order to access resources on hardware accelerator devices, write permission
+should be provided to the user.
+
+.. code-block:: shell
+
+   $ sudo chmod 777 /dev/hisi_zip-*
+
+How To Use UADK Compression In QEMU Migration
+---------------------------------------------
+* Make sure UADK is installed as above
+* Build ``QEMU`` with the ``--enable-uadk`` parameter
+
+  E.g. configure --target-list=aarch64-softmmu --enable-kvm ``--enable-uadk``
+
+* Enable ``UADK`` compression during migration
+
+  Set ``migrate_set_parameter multifd-compression uadk``
+
+Since UADK uses Shared Virtual Addressing (SVA) and devices access virtual
+memory directly, it is possible that SMMUv3 may encounter page faults while
+walking the IO page tables.  This may impact the performance.  In order to
+mitigate this, please make sure to add the ``-mem-prealloc`` parameter to
+the destination VM boot parameters.
+
+Though both UADK and ZLIB are based on the deflate compression algorithm, UADK
+is not fully compatible with ZLIB.  Hence, please make sure to use ``uadk`` on
+both the source and destination during migration.
diff --git a/docs/devel/vfio-migration.rst b/docs/devel/migration/vfio.rst
similarity index 99%
rename from docs/devel/vfio-migration.rst
rename to docs/devel/migration/vfio.rst
index 605fe60e9695a50813b1294bb970ce2c39d2ba07..c49482eab66d8e831ea1c2c791fc895b51893e4d 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/migration/vfio.rst
@@ -1,5 +1,5 @@
 =====================
-VFIO device Migration
+VFIO device migration
 =====================
 
 Migration of virtual machine involves saving the state for each device that
diff --git a/docs/devel/virtio-migration.txt b/docs/devel/migration/virtio.rst
similarity index 37%
rename from docs/devel/virtio-migration.txt
rename to docs/devel/migration/virtio.rst
index 98a6b0ffb5731034be51158bba0ebdd04ddb07bd..611a18b821519e51fc5ddc84b1b5049a8219387a 100644
--- a/docs/devel/virtio-migration.txt
+++ b/docs/devel/migration/virtio.rst
@@ -1,5 +1,6 @@
-Virtio devices and migration
-============================
+=======================
+Virtio device migration
+=======================
 
 Copyright 2015 IBM Corp.
 
@@ -8,91 +9,97 @@ the COPYING file in the top-level directory.
 
 Saving and restoring the state of virtio devices is a bit of a twisty
 maze, for several reasons:
+
 - state is distributed between several parts:
+
   - virtio core, for common fields like features, number of queues, ...
+
   - virtio transport (pci, ccw, ...), for the different proxy devices and
     transport specific state (msix vectors, indicators, ...)
+
   - virtio device (net, blk, ...), for the different device types and their
     state (mac address, request queue, ...)
+
 - most fields are saved via the stream interface; subsequently, subsections
   have been added to make cross-version migration possible
 
 This file attempts to document the current procedure and point out some
 caveats.
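+
+As a concrete anchor for the two walkthroughs below, this is roughly how a
+virtio device class hooks into the flow (a minimal sketch; "virtio-foo" is
+an illustrative name, while ``VMSTATE_VIRTIO_DEVICE`` is the real wrapper
+that ends up calling virtio_save()/virtio_load()):
+
+.. code-block:: c
+
+   /* Illustrative device; the wrapper delegates all the work to the
+    * virtio core, which then calls back into transport and device. */
+   static const VMStateDescription vmstate_virtio_foo = {
+       .name = "virtio-foo-device",
+       .version_id = 1,
+       .minimum_version_id = 1,
+       .fields = (const VMStateField[]) {
+           VMSTATE_VIRTIO_DEVICE,
+           VMSTATE_END_OF_LIST()
+       },
+   };
+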
- Save state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - save() function registered - via VMState wrapper on - device class -virtio_save() <---------- - ------> save_config() - - save proxy device - - save transport-specific - device fields -- save common device - fields -- save common virtqueue - fields - ------> save_queue() - - save transport-specific - virtqueue fields - ------> save_device() - - save device-specific - fields -- save subsections - - device endianness, - if changed from - default endianness - - 64 bit features, if - any high feature bit - is set - - virtio-1 virtqueue - fields, if VERSION_1 - is set +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + save() function registered + via VMState wrapper on + device class + virtio_save() <---------- + ------> save_config() + - save proxy device + - save transport-specific + device fields + - save common device + fields + - save common virtqueue + fields + ------> save_queue() + - save transport-specific + virtqueue fields + ------> save_device() + - save device-specific + fields + - save subsections + - device endianness, + if changed from + default endianness + - 64 bit features, if + any high feature bit + is set + - virtio-1 virtqueue + fields, if VERSION_1 + is set Load state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - load() function registered - via VMState wrapper on - device class -virtio_load() <---------- - ------> load_config() - - load proxy device - - load transport-specific - device fields -- load common device - fields -- load common virtqueue - fields - ------> load_queue() - - load transport-specific - virtqueue fields -- notify guest - ------> load_device() - - load device-specific - fields -- load subsections - - device endianness - - 64 bit features - - virtio-1 virtqueue - fields -- sanitize endianness -- sanitize features -- virtqueue index sanity - check - - feature-dependent setup +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + load() function registered + via VMState wrapper on + device class + virtio_load() <---------- + ------> load_config() + - load proxy device + - load transport-specific + device fields + - load common device + fields + - load common virtqueue + fields + ------> load_queue() + - load transport-specific + virtqueue fields + - notify guest + ------> load_device() + - load device-specific + fields + - load subsections + - device endianness + - 64 bit features + - virtio-1 virtqueue + fields + - sanitize endianness + - sanitize features + - virtqueue index sanity + check + - feature-dependent setup Implications of this setup ========================== diff --git a/docs/devel/vfio-iommufd.rst b/docs/devel/vfio-iommufd.rst new file mode 100644 index 0000000000000000000000000000000000000000..3d1c11f175e5968e9f1519da70c9a0a6ced03995 --- /dev/null +++ b/docs/devel/vfio-iommufd.rst @@ -0,0 +1,166 @@ +=============================== +IOMMUFD BACKEND usage with VFIO +=============================== + +(Same meaning for backend/container/BE) + +With the introduction of iommufd, the Linux kernel provides a generic +interface for user space drivers to propagate their DMA mappings to kernel +for assigned devices. While the legacy kernel interface is group-centric, +the new iommufd interface is device-centric, relying on device fd and iommufd. 
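+
+Concretely, the two interfaces are reached through different device nodes
+(both sets also appear in the diagram below):
+
+.. code-block:: none
+
+   # legacy interface: group-centric
+   /dev/vfio/vfio            <- container
+   /dev/vfio/$group_id       <- group
+
+   # iommufd interface: device-centric
+   /dev/iommu                <- iommufd
+   /dev/vfio/devices/vfioX   <- device cdev
+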
+
+To support both interfaces in the QEMU VFIO device, a base container is
+introduced to abstract the common parts of the VFIO legacy and iommufd
+containers, so that the generic VFIO code can use either container.
+
+The base container implements generic functions such as memory_listener and
+address space management, whereas the derived container implements callbacks
+specific to either legacy or iommufd.  Each container has its own way to set
+up the secure context and the DMA management interface.  The diagram below
+shows how this looks with both containers.
+
+::
+
+                      VFIO                       AddressSpace/Memory
+    +-------+  +----------+  +-----+  +-----+
+    |  pci  |  | platform |  |  ap |  | ccw |
+    +---+---+  +----+-----+  +--+--+  +--+--+     +----------------------+
+        |           |           |        |        |   AddressSpace       |
+        |           |           |        |        +------------+---------+
+    +---V-----------V-----------V--------V----+               /
+    |           VFIOAddressSpace              | <------------+
+    |                  |                      |  MemoryListener
+    |        VFIOContainerBase list           |
+    +-------+----------------------------+----+
+            |                            |
+            |                            |
+    +-------V------+            +--------V----------+
+    |   iommufd    |            |    vfio legacy    |
+    |  container   |            |     container     |
+    +-------+------+            +--------+----------+
+            |                            |
+            | /dev/iommu                 | /dev/vfio/vfio
+            | /dev/vfio/devices/vfioX    | /dev/vfio/$group_id
+ Userspace  |                            |
+ ===========+============================+===========================
+ Kernel     |  device fd                 |
+            +---------------+            | group/container fd
+            | (BIND_IOMMUFD |            | (SET_CONTAINER/SET_IOMMU)
+            |  ATTACH_IOAS) |            | device fd
+            |               |            |
+            |       +-------V------------V-----------------+
+    iommufd |       |               vfio                   |
+(map/unmap  |       +---------+--------------------+-------+
+ ioas_copy) |                 |                    | map/unmap
+            |                 |                    |
+     +------V------+    +-----V------+      +------V--------+
+     |iommufd core |    |   device   |      |  vfio iommu   |
+     +-------------+    +------------+      +---------------+
+
+* Secure Context setup
+
+  - iommufd BE: uses device fd and iommufd to setup secure context
+    (bind_iommufd, attach_ioas)
+  - vfio legacy BE: uses group fd and container fd to setup secure context
+    (set_container, set_iommu)
+
+* Device access
+
+  - iommufd BE: device fd is opened through ``/dev/vfio/devices/vfioX``
+  - vfio legacy BE: device fd is retrieved from group fd ioctl
+
+* DMA Mapping flow
+
+  1. VFIOAddressSpace receives MemoryRegion add/del via MemoryListener
+  2. VFIO populates DMA map/unmap via the container BEs
+
+     * iommufd BE: uses iommufd
+     * vfio legacy BE: uses container fd
+
+Example configuration
+=====================
+
+Step 1: configure the host device
+---------------------------------
+
+It's exactly the same as for the VFIO device with the legacy VFIO container.
+
+Step 2: configure QEMU
+----------------------
+
+Interactions with ``/dev/iommu`` are abstracted by a new iommufd
+object (compiled in with the ``CONFIG_IOMMUFD`` option).
+
+Any QEMU device (e.g. VFIO device) wishing to use ``/dev/iommu`` must
+be linked with an iommufd object.  It gets a new optional property
+named iommufd which allows the user to pass an iommufd object.  Take the
+``vfio-pci`` device for example:
+
+.. code-block:: bash
+
+    -object iommufd,id=iommufd0
+    -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0
+
+Note that ``/dev/iommu`` and the VFIO cdev can be opened externally by a
+management layer.  In that case the fds are passed in; the ``fd`` property
+accepts either a string naming the fd or a number, for example:
+
+.. code-block:: bash
+
+    -object iommufd,id=iommufd0,fd=22
+    -device vfio-pci,iommufd=iommufd0,fd=23
+
+If the ``fd`` property is not passed, the fd is opened by QEMU.
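+
+For instance, a wrapper script could create the fds before launching QEMU (a
+sketch only; the fd numbers are arbitrary but must match the properties, the
+device node name is illustrative, and a real management layer would typically
+hand fds over a QMP channel instead):
+
+.. code-block:: bash
+
+    # open the iommufd and the VFIO device cdev on fixed fd numbers,
+    # which the child QEMU process inherits
+    exec 22<>/dev/iommu
+    exec 23<>/dev/vfio/devices/vfio0
+    qemu-system-x86_64 \
+        -object iommufd,id=iommufd0,fd=22 \
+        -device vfio-pci,iommufd=iommufd0,fd=23 \
+        ...
+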
+
+If no ``iommufd`` object is passed to the ``vfio-pci`` device, iommufd
+is not used and the user gets the behavior based on the legacy VFIO
+container:
+
+.. code-block:: bash
+
+    -device vfio-pci,host=0000:02:00.0
+
+Supported platform
+==================
+
+Supports x86, ARM and s390x currently.
+
+Caveats
+=======
+
+Dirty page sync
+---------------
+
+Dirty page sync is not yet supported with the iommufd backend, so live
+migration is disabled by default.  It can be force-enabled as below, though
+it is inefficient.
+
+.. code-block:: bash
+
+    -object iommufd,id=iommufd0
+    -device vfio-pci,host=0000:02:00.0,iommufd=iommufd0,enable-migration=on
+
+P2P DMA
+-------
+
+PCI p2p DMA is unsupported as IOMMUFD doesn't support mapping hardware PCI
+BAR regions yet.  The warning below shows up for an assigned PCI device; it
+is not a bug.
+
+.. code-block:: none
+
+    qemu-system-x86_64: warning: IOMMU_IOAS_MAP failed: Bad address, PCI BAR?
+    qemu-system-x86_64: vfio_container_dma_map(0x560cb6cb1620, 0xe000000021000, 0x3000, 0x7f32ed55c000) = -14 (Bad address)
+
+FD passing with mdev
+--------------------
+
+The ``vfio-pci`` device checks the sysfsdev property to decide if the backend
+is an mdev.  If FD passing is used, there is no way to know that, and the mdev
+is treated like a real PCI device.  There is an error as below if the user
+wants to enable RAM discarding for an mdev.
+
+.. code-block:: none
+
+    qemu-system-x86_64: -device vfio-pci,iommufd=iommufd0,x-balloon-allowed=on,fd=9: vfio VFIO_FD9: x-balloon-allowed only potentially compatible with mdev devices
+
+``vfio-ap`` and ``vfio-ccw`` devices don't have the same issue, as their
+backend devices are always mdev and RAM discarding is force enabled.
diff --git a/docs/multi-thread-compression.txt b/docs/multi-thread-compression.txt
index 95b1556f67b8a256176819296e1b152a99003096..450e5de469b33ef605e5b9ddffb500971290ce8b 100644
--- a/docs/multi-thread-compression.txt
+++ b/docs/multi-thread-compression.txt
@@ -33,14 +33,15 @@ thread compression can be used to accelerate the compression process.
 
 The decompression speed of Zlib is at least 4 times as quick as
 compression, if the source and destination CPU have equal speed,
-keeping the compression thread count 4 times the decompression
-thread count can avoid resource waste.
+and you choose Zlib as the compression method, keeping the compression
+thread count 4 times the decompression thread count can avoid resource waste.
 
 Compression level can be used to control the compression speed and the
-compression ratio. High compression ratio will take more time, level 0
-stands for no compression, level 1 stands for the best compression
-speed, and level 9 stands for the best compression ratio. Users can
-select a level number between 0 and 9.
+compression ratio. A high compression ratio will take more time; level 1
+stands for the best compression speed, and a higher level means a higher
+compression ratio. For Zlib, users can select a level number between 0 and 9,
+where level 0 stands for no compression. For Zstd, users can select a
+level number between 1 and 22.
 
 
 When to use the multiple thread compression in live migration
@@ -116,16 +117,19 @@ to support the multiple thread compression migration:
 
 2. Activate compression on the source:
     {qemu} migrate_set_capability compress on
 
-3. Set the compression thread count on source:
+3. Set the compression method:
+    {qemu} migrate_set_parameter compress_method zstd
+
+4. Set the compression thread count on source:
     {qemu} migrate_set_parameter compress-threads 12
 
-4.
Set the compression level on the source: +5. Set the compression level on the source: {qemu} migrate_set_parameter compress-level 1 -5. Set the decompression thread count on destination: +6. Set the decompression thread count on destination: {qemu} migrate_set_parameter decompress-threads 3 -6. Start outgoing migration: +7. Start outgoing migration: {qemu} migrate -d tcp:destination.host:4444 {qemu} info migrate Capabilities: ... compress: on @@ -136,6 +140,7 @@ The following are the default settings: compress-threads: 8 decompress-threads: 2 compress-level: 1 (which means best speed) + compress_method: zlib So, only the first two steps are required to use the multiple thread compression in migration. You can do more if the default @@ -143,7 +148,7 @@ settings are not appropriate. TODO ==== -Some faster (de)compression method such as LZ4 and Quicklz can help -to reduce the CPU consumption when doing (de)compression. If using -these faster (de)compression method, less (de)compression threads +Comparing to Zlib, Some faster (de)compression method such as LZ4 +and Quicklz can help to reduce the CPU consumption when doing (de)compression. +If using these faster (de)compression method, less (de)compression threads are needed when doing the migration. diff --git a/docs/specs/acpi_hw_reduced_hotplug.rst b/docs/specs/acpi_hw_reduced_hotplug.rst index 0bd3f9399fee04bc7829f6f5936a398720dc7eeb..3acd6fcd8b8fd84492391f702643b29af935b2c7 100644 --- a/docs/specs/acpi_hw_reduced_hotplug.rst +++ b/docs/specs/acpi_hw_reduced_hotplug.rst @@ -64,7 +64,8 @@ GED IO interface (4 byte access) 0: Memory hotplug event 1: System power down event 2: NVDIMM hotplug event - 3-31: Reserved + 3: CPU hotplug event + 4-31: Reserved **write_access:** diff --git a/docs/sphinx/depfile.py b/docs/sphinx/depfile.py index afdcbcec6e76b5be3071046cec6006c795fb4ac2..e74be6af98bcd57355f7f85b268d2f65d6af1ac3 100644 --- a/docs/sphinx/depfile.py +++ b/docs/sphinx/depfile.py @@ -19,7 +19,7 @@ def get_infiles(env): for x in env.found_docs: - yield env.doc2path(x) + yield str(env.doc2path(x)) yield from ((os.path.join(env.srcdir, dep) for dep in env.dependencies[x])) for mod in sys.modules.values(): diff --git a/docs/sphinx/qapidoc.py b/docs/sphinx/qapidoc.py index 658c288f8fe9b674e8c7230053658f44934145e5..3d19853444247613e59447033dca89a27ac38efe 100644 --- a/docs/sphinx/qapidoc.py +++ b/docs/sphinx/qapidoc.py @@ -229,15 +229,15 @@ def _nodes_for_enum_values(self, doc): section += dlnode return [section] - def _nodes_for_arguments(self, doc, boxed_arg_type): + def _nodes_for_arguments(self, doc, arg_type): """Return list of doctree nodes for the arguments section""" - if boxed_arg_type: + if arg_type and not arg_type.is_implicit(): assert not doc.args section = self._make_section('Arguments') dlnode = nodes.definition_list() dlnode += self._make_dlitem( [nodes.Text('The members of '), - nodes.literal('', boxed_arg_type.name)], + nodes.literal('', arg_type.name)], None) section += dlnode return [section] @@ -341,8 +341,7 @@ def visit_command(self, name, info, ifcond, features, arg_type, allow_preconfig, coroutine): doc = self._cur_doc self._add_doc('Command', - self._nodes_for_arguments(doc, - arg_type if boxed else None) + self._nodes_for_arguments(doc, arg_type) + self._nodes_for_features(doc) + self._nodes_for_sections(doc) + self._nodes_for_if_section(ifcond)) @@ -350,8 +349,7 @@ def visit_command(self, name, info, ifcond, features, arg_type, def visit_event(self, name, info, ifcond, features, arg_type, boxed): doc = self._cur_doc 
self._add_doc('Event', - self._nodes_for_arguments(doc, - arg_type if boxed else None) + self._nodes_for_arguments(doc, arg_type) + self._nodes_for_features(doc) + self._nodes_for_sections(doc) + self._nodes_for_if_section(ifcond)) diff --git a/docs/system/cpu-models-x86.rst.inc b/docs/system/cpu-models-x86.rst.inc index 7f6368f999b10dc943c0c084e4b3253834674c46..37fe1d0ac8197c70d92ea7f62c3b5ad3a17987b3 100644 --- a/docs/system/cpu-models-x86.rst.inc +++ b/docs/system/cpu-models-x86.rst.inc @@ -71,6 +71,16 @@ mixture of host CPU models between machines, if live migration compatibility is required, use the newest CPU model that is compatible across all desired hosts. +``ClearwaterForest`` + Intel Xeon Processor (ClearwaterForest, 2025) + +``SierraForest``, ``SierraForest-v2`` + Intel Xeon Processor (SierraForest, 2024), SierraForest-v2 mitigates + the GDS and RFDS vulnerabilities with stepping 3. + +``GraniteRapids``, ``GraniteRapids-v2`` + Intel Xeon Processor (GraniteRapids, 2024) + ``Cascadelake-Server``, ``Cascadelake-Server-noTSX`` Intel Xeon Processor (Cascade Lake, 2019), with "stepping" levels 6 or 7 only. (The Cascade Lake Xeon processor with *stepping 5 is @@ -181,7 +191,7 @@ features are included if using "Host passthrough" or "Host model". CVE-2018-12127, [MSBDS] CVE-2018-12126). This is an MSR (Model-Specific Register) feature rather than a CPUID feature, - so it will not appear in the Linux ``/proc/cpuinfo`` in the host or + therefore it will not appear in the Linux ``/proc/cpuinfo`` in the host or guest. Instead, the host kernel uses it to populate the MDS vulnerability file in ``sysfs``. @@ -189,10 +199,10 @@ features are included if using "Host passthrough" or "Host model". affected} in the ``/sys/devices/system/cpu/vulnerabilities/mds`` file. ``taa-no`` - Recommended to inform that the guest that the host is ``not`` + Recommended to inform the guest that the host is ``not`` vulnerable to CVE-2019-11135, TSX Asynchronous Abort (TAA). - This too is an MSR feature, so it does not show up in the Linux + This is also an MSR feature, therefore it does not show up in the Linux ``/proc/cpuinfo`` in the host or guest. It should only be enabled for VMs if the host reports ``Not affected`` @@ -214,7 +224,7 @@ features are included if using "Host passthrough" or "Host model". By disabling TSX, KVM-based guests can avoid paying the price of mitigating TSX-based attacks. - Note that ``tsx-ctrl`` too is an MSR feature, so it does not show + Note that ``tsx-ctrl`` is also an MSR feature, therefore it does not show up in the Linux ``/proc/cpuinfo`` in the host or guest. To validate that Intel TSX is indeed disabled for the guest, there are @@ -223,6 +233,38 @@ features are included if using "Host passthrough" or "Host model". ``/sys/devices/system/cpu/vulnerabilities/tsx_async_abort`` file in the guest should report ``Mitigation: TSX disabled``. +``bhi-no`` + Recommended to inform the guest that the host is ``not`` + vulnerable to CVE-2022-0001, Branch History Injection (BHI). + + This is also an MSR feature, therefore it does not show up in the Linux + ``/proc/cpuinfo`` in the host or guest. + + It should only be enabled for VMs if the host reports + ``BHI: Not affected`` in the + ``/sys/devices/system/cpu/vulnerabilities/spectre_v2`` file. + +``gds-no`` + Recommended to inform the guest that the host is ``not`` + vulnerable to CVE-2022-40982, Gather Data Sampling (GDS). + + This is also an MSR feature, therefore it does not show up in the Linux + ``/proc/cpuinfo`` in the host or guest. 
+
+  It should only be enabled for VMs if the host reports ``Not affected``
+  in the ``/sys/devices/system/cpu/vulnerabilities/gather_data_sampling``
+  file.
+
+``rfds-no``
+  Recommended to inform the guest that the host is ``not``
+  vulnerable to CVE-2023-28746, Register File Data Sampling (RFDS).
+
+  This is also an MSR feature, therefore it does not show up in the Linux
+  ``/proc/cpuinfo`` in the host or guest.
+
+  It should only be enabled for VMs if the host reports ``Not affected``
+  in the ``/sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling``
+  file.
 
 Preferred CPU models for AMD x86 hosts
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst
index d1f3277cb026281d34e6ab9257eed30c5e640f89..e1b2d18fb1066b9391138aa07ae56be16f721137 100644
--- a/docs/system/device-emulation.rst
+++ b/docs/system/device-emulation.rst
@@ -98,3 +98,4 @@ Emulated Devices
    devices/canokey.rst
    devices/usb-u2f.rst
    devices/igb.rst
+   devices/vhost-vdpa-generic-device.rst
diff --git a/docs/system/devices/vhost-vdpa-generic-device.rst b/docs/system/devices/vhost-vdpa-generic-device.rst
new file mode 100644
index 0000000000000000000000000000000000000000..25fbcac60e64b7e7fcc50bc5105e623bfbe2fcc1
--- /dev/null
+++ b/docs/system/devices/vhost-vdpa-generic-device.rst
@@ -0,0 +1,46 @@
+
+=========================
+vhost-vDPA generic device
+=========================
+
+This document explains the usage of the vhost-vDPA generic device.
+
+Description
+-----------
+
+A vDPA (virtio data path acceleration) device is a device that uses a
+datapath which complies with the virtio specifications, together with a
+vendor-specific control path.
+
+QEMU provides two types of vhost-vDPA devices to enable a vDPA device.  One
+is type-sensitive, which means QEMU needs to know the actual device type
+(e.g. net, blk, scsi); the other is called the "vhost-vDPA generic device",
+which is type-insensitive.
+
+The vhost-vDPA generic device builds on the vhost-vdpa subsystem and virtio
+subsystem.  It is quite small, but it can support any type of virtio device.
+
+Examples
+--------
+
+Prepare the vhost-vDPA backends first:
+
+::
+
+  host# ls -l /dev/vhost-vdpa-*
+  crw------- 1 root root 236, 0 Nov 2 00:49 /dev/vhost-vdpa-0
+
+Start QEMU with a virtio-mmio bus:
+
+::
+
+  host# qemu-system \
+      -M microvm -m 512 -smp 2 -kernel ... -initrd ... \
+      -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0 \
+      ...
+
+Start QEMU with a virtio-pci bus:
+
+::
+
+  host# qemu-system \
+      -M pc -m 512 -smp 2 \
+      -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0 \
+      ...
diff --git a/docs/system/i386/amd-memory-encryption.rst b/docs/system/i386/amd-memory-encryption.rst
index e9bc142bc1308dfcae7b6cf375e43bc5dd15f7e2..b7e3f46ff689f1588de12756441afb0974811a2c 100644
--- a/docs/system/i386/amd-memory-encryption.rst
+++ b/docs/system/i386/amd-memory-encryption.rst
@@ -177,7 +177,45 @@ TODO
 
 Live Migration
 ---------------
 
-TODO
+AMD SEV encrypts the memory of VMs, and because a different key is used
+in each VM, the hypervisor is unable to simply copy the ciphertext from
+one VM to another to migrate the VM.  Instead, the AMD SEV Key
+Management API provides a set of functions which the hypervisor can use
+to package a guest page for migration, while maintaining the confidentiality
+provided by AMD SEV.
+
+SEV guest VMs have the concept of private and shared memory.  The private
+memory is encrypted with the guest-specific key, while shared memory may
+be encrypted with the hypervisor key.
+The migration APIs provided by the SEV API spec should be used for
+migrating the private pages.  The KVM_GET_PAGE_ENC_BITMAP ioctl can be
+used to get the guest page encryption bitmap.  The bitmap can be used
+to check if the given guest page is private or shared.
+
+Before initiating the migration, we need to know the target machine's public
+Diffie-Hellman key (PDH) and certificate chain.  It can be retrieved
+with the 'query-sev-capabilities' QMP command or using the sev-tool.  The
+migrate-set-parameter command can be used to pass the target machine's PDH and
+certificate chain.
+
+During the migration flow, SEND_START is called on the source hypervisor
+to create an outgoing encryption context.  The SEV guest policy dictates whether
+the certificate passed through the migrate-sev-set-info command will be
+validated.  SEND_UPDATE_DATA is called to encrypt the guest private pages.
+After migration is completed, SEND_FINISH is called to destroy the encryption
+context and make the VM non-runnable, to protect it against cloning.
+
+On the target machine, RECEIVE_START is called first to create an
+incoming encryption context.  RECEIVE_UPDATE_DATA is called to copy
+the received encrypted pages into guest memory.  After migration has
+completed, RECEIVE_FINISH is called to make the VM runnable.
+
+For more information about the migration, see the SEV API Appendix A
+Usage flow (Live migration section).
+
+NOTE:
+To protect against memory cloning, the SEV APIs are designed to make the
+VM unrunnable in case of migration failure.
 
 References
 ----------
diff --git a/docs/system/loongarch/virt.rst b/docs/system/loongarch/virt.rst
index c37268b404d4d2580439abd2e31b6e3492f765de..aa4719d4bdffd6636984cf67641b9fc5bde4f179 100644
--- a/docs/system/loongarch/virt.rst
+++ b/docs/system/loongarch/virt.rst
@@ -28,6 +28,37 @@ The ``qemu-system-loongarch64`` provides emulation for virt
 machine. You can specify the machine type ``virt`` and cpu type ``la464``.
 
+CPU Topology
+------------
+
+The ``LA464`` type CPUs have the concept of Socket, Core and Thread.
+
+For example:
+
+``-smp 1,maxcpus=M,sockets=S,cores=C,threads=T``
+
+The above parameters indicate that the machine has a maximum of ``M`` vCPUs and
+``S`` sockets, each socket has ``C`` cores, each core has ``T`` threads,
+and each thread corresponds to a vCPU.
+
+Then ``M`` ``S`` ``C`` ``T`` have the following relationship:
+
+``M = S * C * T``
+
+In the CPU topology relationship, when we know the ``socket_id`` ``core_id``
+and ``thread_id`` of a CPU, we can calculate its ``arch_id``:
+
+``arch_id = (socket_id * C * T) + (core_id * T) + thread_id``
+
+(As a consistency check: with ``sockets=2,cores=4,threads=2``, the vCPU with
+``socket_id`` 1, ``core_id`` 1 and ``thread_id`` 1 has
+``arch_id`` 1*4*2 + 1*2 + 1 = 11.)
+
+Similarly, when we know the ``arch_id`` of a CPU,
+we can also get its ``socket_id`` ``core_id`` and ``thread_id``:
+
+``socket_id = arch_id / (C * T)``
+
+``core_id = (arch_id / T) % C``
+
+``thread_id = arch_id % T``
+
 Boot options
 ------------
diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
index 4459c065f1942b7061a3e8b54352df2004978094..3653adb963ee17f5165b46b053935e3c42e1445e 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -406,7 +406,7 @@ Command description:
 
   Compare exits with ``0`` in case the images are equal and with ``1``
  in case the images differ. Other exit codes mean an error occurred during
  execution and standard error output should contain an error message.
- The following table sumarizes all exit codes of the compare subcommand: + The following table summarizes all exit codes of the compare subcommand: 0 Images are identical (or requested help was printed) diff --git a/docs/tools/virtfs-proxy-helper.rst b/docs/tools/virtfs-proxy-helper.rst index bd310ebb07b5fc0ae9ceeeb5418cf4edfdd2192e..175b4809260093f85b45904e399b6c9bfb72e829 100644 --- a/docs/tools/virtfs-proxy-helper.rst +++ b/docs/tools/virtfs-proxy-helper.rst @@ -55,7 +55,7 @@ The following options are supported: .. option:: -f, --fd SOCKET_ID Use given file descriptor as socket descriptor for communicating with - qemu proxy fs drier. Usually a helper like libvirt will create + qemu proxy fs driver. Usually a helper like libvirt will create socketpair and pass one of the fds as parameter to this option. .. option:: -s, --socket SOCKET_FILE diff --git a/dump/dump.c b/dump/dump.c index 481905076493c7d723e01f84a08e76ea0e5550e3..787059ac2c8118311124ccbce3f1839219827df8 100644 --- a/dump/dump.c +++ b/dump/dump.c @@ -20,6 +20,7 @@ #include "sysemu/dump.h" #include "sysemu/runstate.h" #include "sysemu/cpus.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "qapi/qapi-commands-dump.h" #include "qapi/qapi-events-dump.h" @@ -2065,6 +2066,12 @@ void qmp_dump_guest_memory(bool paging, const char *protocol, Error **errp) { ERRP_GUARD(); + + if (virtcca_cvm_enabled()) { + error_setg(errp, "The dump-guest-memory command is temporarily unsupported in cvm."); + return; + } + const char *p; int fd; DumpState *s; diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c index 46d752bbc2cd31694af5502aa69d233c3a2548ed..31c3dae52540f0f803a75d9f191c59a4dfe69852 100644 --- a/gdbstub/gdbstub.c +++ b/gdbstub/gdbstub.c @@ -582,6 +582,19 @@ void gdb_register_coprocessor(CPUState *cpu, } } +void gdb_unregister_coprocessor_all(CPUState *cpu) +{ + /* + * Safe to nuke everything. 
GDBRegisterState::xml is static const char so + * it won't be freed + */ + g_array_free(cpu->gdb_regs, true); + + cpu->gdb_regs = NULL; + cpu->gdb_num_regs = 0; + cpu->gdb_num_g_regs = 0; +} + static void gdb_process_breakpoint_remove_all(GDBProcess *p) { CPUState *cpu = gdb_get_first_cpu_in_process(p); diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index af636cfb2d30564b0c9d7c5c7c993b108c2bd2d0..9a291d1b51dece98fbc5774ae12e72f26f21b4d6 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -2587,6 +2587,11 @@ static void coroutine_fn v9fs_readdir(void *opaque) retval = -EINVAL; goto out_nofid; } + if (fidp->fid_type != P9_FID_DIR) { + warn_report_once("9p: bad client: T_readdir on non-directory stream"); + retval = -ENOTDIR; + goto out; + } if (!fidp->fs.dir.stream) { retval = -EINVAL; goto out; diff --git a/hw/acpi/acpi-cpu-hotplug-stub.c b/hw/acpi/acpi-cpu-hotplug-stub.c index 3fc4b14c260fb55ffff585860df8ac652516302a..c6c61bb9cd33c1486093af9787a8d0fedc006053 100644 --- a/hw/acpi/acpi-cpu-hotplug-stub.c +++ b/hw/acpi/acpi-cpu-hotplug-stub.c @@ -19,6 +19,12 @@ void legacy_acpi_cpu_hotplug_init(MemoryRegion *parent, Object *owner, return; } +void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, + CPUHotplugState *state, hwaddr base_addr) +{ + return; +} + void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list) { return; diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index af66bde0f55d5db5c123958f0f15afd5bbc047b8..3fb996c034c395f91208e97f6984b25069ff7134 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -47,7 +47,7 @@ static void build_prepend_byte(GArray *array, uint8_t val) g_array_prepend_val(array, val); } -static void build_append_byte(GArray *array, uint8_t val) +void build_append_byte(GArray *array, uint8_t val) { g_array_append_val(array, val); } @@ -1554,6 +1554,28 @@ Aml *aml_sleep(uint64_t msec) return var; } +/* ACPI 5.0b: 6.4.3.7 Generic Register Descriptor */ +Aml *aml_generic_register(AmlRegionSpace rs, uint8_t reg_width, + uint8_t reg_offset, AmlAccessType type, uint64_t addr) +{ + int i; + Aml *var = aml_alloc(); + build_append_byte(var->buf, 0x82); /* Generic Register Descriptor */ + build_append_byte(var->buf, 0x0C); /* Length, bits[7:0] value = 0x0C */ + build_append_byte(var->buf, 0); /* Length, bits[15:8] value = 0 */ + build_append_byte(var->buf, rs); /* Address Space ID */ + build_append_byte(var->buf, reg_width); /* Register Bit Width */ + build_append_byte(var->buf, reg_offset); /* Register Bit Offset */ + build_append_byte(var->buf, type); /* Access Size */ + + /* Register address */ + for (i = 0; i < 8; i++) { + build_append_byte(var->buf, extract64(addr, i * 8, 8)); + } + + return var; +} + static uint8_t Hex2Byte(const char *src) { int hi, lo; @@ -1968,10 +1990,10 @@ void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms, * ACPI spec, Revision 6.3 * 5.2.29.1 Processor hierarchy node structure (Type 0) */ -static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, - uint32_t parent, uint32_t id, - uint32_t *priv_rsrc, - uint32_t priv_num) +void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, + uint32_t parent, uint32_t id, + uint32_t *priv_rsrc, + uint32_t priv_num) { int i; @@ -1994,6 +2016,59 @@ static void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, } } +void build_spcr(GArray *table_data, BIOSLinker *linker, + const AcpiSpcrData *f, const uint8_t rev, + const char *oem_id, const char *oem_table_id) +{ + AcpiTable table = { .sig = "SPCR", .rev = rev, .oem_id = oem_id, + 
.oem_table_id = oem_table_id };
+
+    acpi_table_begin(&table, table_data);
+    /* Interface type */
+    build_append_int_noprefix(table_data, f->interface_type, 1);
+    /* Reserved */
+    build_append_int_noprefix(table_data, 0, 3);
+    /* Base Address */
+    build_append_gas(table_data, f->base_addr.id, f->base_addr.width,
+                     f->base_addr.offset, f->base_addr.size,
+                     f->base_addr.addr);
+    /* Interrupt type */
+    build_append_int_noprefix(table_data, f->interrupt_type, 1);
+    /* IRQ */
+    build_append_int_noprefix(table_data, f->pc_interrupt, 1);
+    /* Global System Interrupt */
+    build_append_int_noprefix(table_data, f->interrupt, 4);
+    /* Baud Rate */
+    build_append_int_noprefix(table_data, f->baud_rate, 1);
+    /* Parity */
+    build_append_int_noprefix(table_data, f->parity, 1);
+    /* Stop Bits */
+    build_append_int_noprefix(table_data, f->stop_bits, 1);
+    /* Flow Control */
+    build_append_int_noprefix(table_data, f->flow_control, 1);
+    /* Language */
+    build_append_int_noprefix(table_data, f->language, 1);
+    /* Terminal Type */
+    build_append_int_noprefix(table_data, f->terminal_type, 1);
+    /* PCI Device ID */
+    build_append_int_noprefix(table_data, f->pci_device_id, 2);
+    /* PCI Vendor ID */
+    build_append_int_noprefix(table_data, f->pci_vendor_id, 2);
+    /* PCI Bus Number */
+    build_append_int_noprefix(table_data, f->pci_bus, 1);
+    /* PCI Device Number */
+    build_append_int_noprefix(table_data, f->pci_device, 1);
+    /* PCI Function Number */
+    build_append_int_noprefix(table_data, f->pci_function, 1);
+    /* PCI Flags */
+    build_append_int_noprefix(table_data, f->pci_flags, 4);
+    /* PCI Segment */
+    build_append_int_noprefix(table_data, f->pci_segment, 1);
+    /* Reserved */
+    build_append_int_noprefix(table_data, 0, 4);
+
+    acpi_table_end(linker, &table);
+}
 /*
  * ACPI spec, Revision 6.3
  * 5.2.29 Processor Properties Topology Table (PPTT)
diff --git a/hw/acpi/core.c b/hw/acpi/core.c
index ec5e127d17226ee9a7e51860eb64b897ba3f7d72..b6241f70e9eeebe11907b8e1f97acc4d79741947 100644
--- a/hw/acpi/core.c
+++ b/hw/acpi/core.c
@@ -24,6 +24,7 @@
 #include "hw/acpi/acpi.h"
 #include "hw/nvram/fw_cfg.h"
 #include "qemu/config-file.h"
+#include "qemu/log.h"
 #include "qapi/error.h"
 #include "qapi/opts-visitor.h"
 #include "qapi/qapi-events-run-state.h"
@@ -588,13 +589,16 @@ static void acpi_pm_cnt_write(void *opaque, hwaddr addr, uint64_t val,
             uint16_t sus_typ = (val >> 10) & 7;
             switch (sus_typ) {
             case 0: /* soft power off */
+                qemu_log("VM will be soft powered off\n");
                 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                 break;
             case 1:
+                qemu_log("VM will enter the suspend state\n");
                 qemu_system_suspend_request();
                 break;
             default:
                 if (sus_typ == ar->pm1.cnt.s4_val) { /* S4 request */
+                    qemu_log("VM will enter the S4 state\n");
                     qapi_event_send_suspend_disk();
                     qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                 }
diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
index 011d2c6c2ddf94f1b3f9af99436f99c11655deea..9a3b417504776daaa3ffc1d62c4fac065ad1fba6 100644
--- a/hw/acpi/cpu.c
+++ b/hw/acpi/cpu.c
@@ -1,13 +1,13 @@
 #include "qemu/osdep.h"
 #include "migration/vmstate.h"
 #include "hw/acpi/cpu.h"
+#include "hw/acpi/cpu_hotplug.h"
 #include "hw/core/cpu.h"
 #include "qapi/error.h"
 #include "qapi/qapi-events-acpi.h"
 #include "trace.h"
 #include "sysemu/numa.h"
 
-#define ACPI_CPU_HOTPLUG_REG_LEN 12
 #define ACPI_CPU_SELECTOR_OFFSET_WR 0
 #define ACPI_CPU_FLAGS_OFFSET_RW 4
 #define ACPI_CPU_CMD_OFFSET_WR 5
@@ -64,10 +64,11 @@ static uint64_t cpu_hotplug_rd(void *opaque, hwaddr addr, unsigned size)
 
     cdev = &cpu_st->devs[cpu_st->selector];
     switch (addr) {
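+    /*
+     * Bit layout of the flags register as read back by the guest through
+     * ACPI_CPU_FLAGS_OFFSET_RW: bit 0 - enabled, bit 1 - insert event
+     * pending, bit 2 - remove event pending, bit 4 - firmware-assisted
+     * eject requested, bit 5 - present; bit 3 is currently unused.
+     */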
case ACPI_CPU_FLAGS_OFFSET_RW: /* pack and return is_* fields */ - val |= cdev->cpu ? 1 : 0; + val |= cdev->is_enabled ? 1 : 0; val |= cdev->is_inserting ? 2 : 0; val |= cdev->is_removing ? 4 : 0; val |= cdev->fw_remove ? 16 : 0; + val |= cdev->is_present ? 32 : 0; trace_cpuhp_acpi_read_flags(cpu_st->selector, val); break; case ACPI_CPU_CMD_DATA_OFFSET_RW: @@ -226,7 +227,20 @@ void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, state->dev_count = id_list->len; state->devs = g_new0(typeof(*state->devs), state->dev_count); for (i = 0; i < id_list->len; i++) { - state->devs[i].cpu = CPU(id_list->cpus[i].cpu); + struct CPUState *cpu = CPU(id_list->cpus[i].cpu); + if (qemu_present_cpu(cpu)) { + state->devs[i].is_present = true; + } else { + state->devs[i].is_present = false; + } + + if (qemu_enabled_cpu(cpu)) { + state->devs[i].cpu = cpu; + state->devs[i].is_enabled = true; + } else { + state->devs[i].is_enabled = false; + } + state->devs[i].arch_id = id_list->cpus[i].arch_id; } memory_region_init_io(&state->ctrl_reg, owner, &cpu_hotplug_ops, state, @@ -259,6 +273,8 @@ void acpi_cpu_plug_cb(HotplugHandler *hotplug_dev, } cdev->cpu = CPU(dev); + cdev->is_present = true; + cdev->is_enabled = true; if (dev->hotplugged) { cdev->is_inserting = true; acpi_send_event(DEVICE(hotplug_dev), ACPI_CPU_HOTPLUG_STATUS); @@ -290,16 +306,23 @@ void acpi_cpu_unplug_cb(CPUHotplugState *cpu_st, return; } + cdev->is_enabled = false; + if (!qemu_persistent_cpu(CPU(dev))) { + cdev->is_present = false; + } + cdev->cpu = NULL; } static const VMStateDescription vmstate_cpuhp_sts = { .name = "CPU hotplug device state", - .version_id = 1, + .version_id = 2, .minimum_version_id = 1, .fields = (VMStateField[]) { VMSTATE_BOOL(is_inserting, AcpiCpuStatus), VMSTATE_BOOL(is_removing, AcpiCpuStatus), + VMSTATE_BOOL(is_present, AcpiCpuStatus), + VMSTATE_BOOL(is_enabled, AcpiCpuStatus), VMSTATE_UINT32(ost_event, AcpiCpuStatus), VMSTATE_UINT32(ost_status, AcpiCpuStatus), VMSTATE_END_OF_LIST() @@ -337,11 +360,15 @@ const VMStateDescription vmstate_cpu_hotplug = { #define CPU_REMOVE_EVENT "CRMV" #define CPU_EJECT_EVENT "CEJ0" #define CPU_FW_EJECT_EVENT "CEJF" +#define CPU_PRESENT "CPRS" void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, + build_cpu_cppc_fn build_cpu_cppc, + hwaddr base_addr, const char *res_root, - const char *event_handler_method) + const char *event_handler_method, + AmlRegionSpace rs) { Aml *ifctx; Aml *field; @@ -365,14 +392,22 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_name_decl("_UID", aml_string("CPU Hotplug resources"))); aml_append(cpu_ctrl_dev, aml_mutex(CPU_LOCK, 0)); + assert((rs == AML_SYSTEM_IO) || (rs == AML_SYSTEM_MEMORY)); + crs = aml_resource_template(); - aml_append(crs, aml_io(AML_DECODE16, io_base, io_base, 1, + if (rs == AML_SYSTEM_IO) { + aml_append(crs, aml_io(AML_DECODE16, base_addr, base_addr, 1, ACPI_CPU_HOTPLUG_REG_LEN)); + } else if (rs == AML_SYSTEM_MEMORY) { + aml_append(crs, aml_memory32_fixed(base_addr, + ACPI_CPU_HOTPLUG_REG_LEN, AML_READ_WRITE)); + } + aml_append(cpu_ctrl_dev, aml_name_decl("_CRS", crs)); /* declare CPU hotplug MMIO region with related access fields */ aml_append(cpu_ctrl_dev, - aml_operation_region("PRST", AML_SYSTEM_IO, aml_int(io_base), + aml_operation_region("PRST", rs, aml_int(base_addr), ACPI_CPU_HOTPLUG_REG_LEN)); field = aml_field("PRST", AML_BYTE_ACC, AML_NOLOCK, @@ -388,7 +423,9 @@ void 
build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(field, aml_named_field(CPU_EJECT_EVENT, 1)); /* tell firmware to do device eject, write only */ aml_append(field, aml_named_field(CPU_FW_EJECT_EVENT, 1)); - aml_append(field, aml_reserved_field(3)); + /* 1 if present, read only */ + aml_append(field, aml_named_field(CPU_PRESENT, 1)); + aml_append(field, aml_reserved_field(2)); aml_append(field, aml_named_field(CPU_COMMAND, 8)); aml_append(cpu_ctrl_dev, field); @@ -418,6 +455,7 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, Aml *ctrl_lock = aml_name("%s.%s", cphp_res_path, CPU_LOCK); Aml *cpu_selector = aml_name("%s.%s", cphp_res_path, CPU_SELECTOR); Aml *is_enabled = aml_name("%s.%s", cphp_res_path, CPU_ENABLED); + Aml *is_present = aml_name("%s.%s", cphp_res_path, CPU_PRESENT); Aml *cpu_cmd = aml_name("%s.%s", cphp_res_path, CPU_COMMAND); Aml *cpu_data = aml_name("%s.%s", cphp_res_path, CPU_DATA); Aml *ins_evt = aml_name("%s.%s", cphp_res_path, CPU_INSERT_EVENT); @@ -446,13 +484,26 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, { Aml *idx = aml_arg(0); Aml *sta = aml_local(0); + Aml *ifctx2; + Aml *else_ctx; aml_append(method, aml_acquire(ctrl_lock, 0xFFFF)); aml_append(method, aml_store(idx, cpu_selector)); aml_append(method, aml_store(zero, sta)); - ifctx = aml_if(aml_equal(is_enabled, one)); + ifctx = aml_if(aml_equal(is_present, one)); { - aml_append(ifctx, aml_store(aml_int(0xF), sta)); + ifctx2 = aml_if(aml_equal(is_enabled, one)); + { + /* cpu is present and enabled */ + aml_append(ifctx2, aml_store(aml_int(0xF), sta)); + } + aml_append(ifctx, ifctx2); + else_ctx = aml_else(); + { + /* cpu is present but disabled */ + aml_append(else_ctx, aml_store(aml_int(0xD), sta)); + } + aml_append(ifctx, else_ctx); } aml_append(method, ifctx); aml_append(method, aml_release(ctrl_lock)); @@ -658,14 +709,20 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(dev, aml_name_decl("_UID", uid)); } + if (build_cpu_cppc) { + build_cpu_cppc(i, arch_ids->len, dev); + } + method = aml_method("_STA", 0, AML_SERIALIZED); aml_append(method, aml_return(aml_call1(CPU_STS_METHOD, uid))); aml_append(dev, method); /* build _MAT object */ - build_madt_cpu(i, arch_ids, madt_buf, true); /* set enabled flag */ - aml_append(dev, aml_name_decl("_MAT", - aml_buffer(madt_buf->len, (uint8_t *)madt_buf->data))); + if (build_madt_cpu) { + build_madt_cpu(i, arch_ids, madt_buf, true); /* set enabled flag */ + aml_append(dev, aml_name_decl("_MAT", + aml_buffer(madt_buf->len, (uint8_t *)madt_buf->data))); + } g_array_free(madt_buf, true); if (CPU(arch_ids->cpus[i].cpu) != first_cpu) { @@ -696,9 +753,11 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_append(sb_scope, cpus_dev); aml_append(table, sb_scope); - method = aml_method(event_handler_method, 0, AML_NOTSERIALIZED); - aml_append(method, aml_call0("\\_SB.CPUS." CPU_SCAN_METHOD)); - aml_append(table, method); + if (event_handler_method) { + method = aml_method(event_handler_method, 0, AML_NOTSERIALIZED); + aml_append(method, aml_call0("\\_SB.CPUS." 
CPU_SCAN_METHOD));
+        aml_append(table, method);
+    }
 
     g_free(cphp_res_path);
 }
diff --git a/hw/acpi/cpufreq.c b/hw/acpi/cpufreq.c
new file mode 100644
index 0000000000000000000000000000000000000000..a76f7b8fa2c86e790f6e66d73093e388e9bd8a21
--- /dev/null
+++ b/hw/acpi/cpufreq.c
@@ -0,0 +1,280 @@
+/*
+ * ACPI CPPC register device
+ *
+ * Support for showing CPU frequency in guest OS.
+ *
+ * Copyright (c) 2019 HUAWEI TECHNOLOGIES CO.,LTD.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/sysbus.h"
+#include "chardev/char.h"
+#include "qemu/log.h"
+#include "trace.h"
+#include "qemu/option.h"
+#include "sysemu/sysemu.h"
+#include "hw/acpi/acpi-defs.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "hw/boards.h"
+
+#define TYPE_CPUFREQ "cpufreq"
+#define CPUFREQ(obj) OBJECT_CHECK(CpuhzState, (obj), TYPE_CPUFREQ)
+#define NOMINAL_FREQ_FILE "/sys/devices/system/cpu/cpu0/acpi_cppc/nominal_freq"
+#define CPU_MAX_FREQ_FILE "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq"
+#define HZ_MAX_LENGTH 1024
+#define MAX_SUPPORT_SPACE 0x10000
+
+/*
+ * Since Hi1616 will not support CPPC, we simply use its nominal frequency as
+ * the default.
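+ * Note that, despite the "Hz" naming used in this file, the performance
+ * and frequency values handled below are all expressed in MHz, so
+ * DEFAULT_HZ means 2400 MHz.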
+ */
+#define DEFAULT_HZ 2400
+
+int cppc_regs_offset[CPPC_REG_COUNT] = {
+    [HIGHEST_PERF] = 0,
+    [NOMINAL_PERF] = 4,
+    [LOW_NON_LINEAR_PERF] = 8,
+    [LOWEST_PERF] = 12,
+    [GUARANTEED_PERF] = 16,
+    [DESIRED_PERF] = 20,
+    [MIN_PERF] = -1,
+    [MAX_PERF] = -1,
+    [PERF_REDUC_TOLERANCE] = -1,
+    [TIME_WINDOW] = -1,
+    [CTR_WRAP_TIME] = -1,
+    [REFERENCE_CTR] = 24,
+    [DELIVERED_CTR] = 32,
+    [PERF_LIMITED] = 40,
+    [ENABLE] = -1,
+    [AUTO_SEL_ENABLE] = -1,
+    [AUTO_ACT_WINDOW] = -1,
+    [ENERGY_PERF] = -1,
+    [REFERENCE_PERF] = -1,
+    [LOWEST_FREQ] = 44,
+    [NOMINAL_FREQ] = 48,
+};
+
+typedef struct CpuhzState {
+    SysBusDevice parent_obj;
+
+    MemoryRegion iomem;
+    uint32_t HighestPerformance;
+    uint32_t NominalPerformance;
+    uint32_t LowestNonlinearPerformance;
+    uint32_t LowestPerformance;
+    uint32_t GuaranteedPerformance;
+    uint32_t DesiredPerformance;
+    uint64_t ReferencePerformanceCounter;
+    uint64_t DeliveredPerformanceCounter;
+    uint32_t PerformanceLimited;
+    uint32_t LowestFreq;
+    uint32_t NominalFreq;
+    uint32_t num_cpu;
+    uint32_t reg_size;
+} CpuhzState;
+
+static uint64_t cpufreq_read(void *opaque, hwaddr offset, unsigned size)
+{
+    CpuhzState *s = (CpuhzState *)opaque;
+    uint64_t r;
+    uint64_t n;
+
+    if (offset >= s->num_cpu * CPPC_REG_PER_CPU_STRIDE) {
+        warn_report("cpufreq_read: offset 0x%lx out of range", offset);
+        return 0;
+    }
+
+    n = offset % CPPC_REG_PER_CPU_STRIDE;
+    switch (n) {
+    case 0:
+        r = s->HighestPerformance;
+        break;
+    case 4:
+        r = s->NominalPerformance;
+        break;
+    case 8:
+        r = s->LowestNonlinearPerformance;
+        break;
+    case 12:
+        r = s->LowestPerformance;
+        break;
+    case 16:
+        r = s->GuaranteedPerformance;
+        break;
+    case 20:
+        r = s->DesiredPerformance;
+        break;
+    /*
+     * We don't have real performance counters, and they are hard to
+     * emulate, so always report a counter value of 1 so that Linux uses
+     * the DesiredPerformance value directly.
+     */
+    case 24:
+        r = s->ReferencePerformanceCounter;
+        break;
+    /*
+     * The guest may still access these 64-bit registers with 32-bit reads;
+     * handle the upper halves as well to avoid spurious warnings.
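+     * For example, a 32-bit read of the upper half of REFERENCE_CTR lands
+     * at per-CPU offset 28 (REFERENCE_CTR itself is at 24); DELIVERED_CTR
+     * likewise splits into offsets 32 and 36.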
+ */ + case 28: + r = s->ReferencePerformanceCounter >> 32; + break; + case 32: + r = s->DeliveredPerformanceCounter; + break; + case 36: + r = s->DeliveredPerformanceCounter >> 32; + break; + + case 40: + r = s->PerformanceLimited; + break; + case 44: + r = s->LowestFreq; + break; + case 48: + r = s->NominalFreq; + break; + default: + error_printf("cpufreq_read: Bad offset 0x%lx\n", offset); + r = 0; + break; + } + return r; +} + +static void cpufreq_write(void *opaque, hwaddr offset, + uint64_t value, unsigned size) +{ + CpuhzState *s = CPUFREQ(opaque); + uint64_t n; + + if (offset >= s->num_cpu * CPPC_REG_PER_CPU_STRIDE) { + error_printf("cpufreq_write: offset 0x%lx out of range", offset); + return; + } + + n = offset % CPPC_REG_PER_CPU_STRIDE; + + switch (n) { + case 20: + break; + default: + error_printf("cpufreq_write: Bad offset 0x%lx\n", offset); + } +} + +static uint32_t CPPC_Read(const char *hostpath) +{ + int fd; + char buffer[HZ_MAX_LENGTH] = { 0 }; + uint64_t hz; + int len; + const char *endptr = NULL; + int ret; + + fd = qemu_open_old(hostpath, O_RDONLY); + if (fd < 0) { + return 0; + } + + len = read(fd, buffer, HZ_MAX_LENGTH); + qemu_close(fd); + if (len <= 0) { + return 0; + } + ret = qemu_strtoul(buffer, &endptr, 0, &hz); + if (ret < 0) { + return 0; + } + return (uint32_t)hz; +} + +static const MemoryRegionOps cpufreq_ops = { + .read = cpufreq_read, + .write = cpufreq_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void hz_init(CpuhzState *s) +{ + uint32_t hz; + + hz = CPPC_Read(NOMINAL_FREQ_FILE); + if (hz == 0) { + hz = CPPC_Read(CPU_MAX_FREQ_FILE); + if (hz == 0) { + hz = DEFAULT_HZ; + } else { + /* Value in CpuMaxFrequency is in KHz unit; convert to MHz */ + hz = hz / 1000; + } + } + + s->HighestPerformance = hz; + s->NominalPerformance = hz; + s->LowestNonlinearPerformance = hz; + s->LowestPerformance = hz; + s->GuaranteedPerformance = hz; + s->DesiredPerformance = hz; + s->ReferencePerformanceCounter = 1; + s->DeliveredPerformanceCounter = 1; + s->PerformanceLimited = 0; + s->LowestFreq = hz; + s->NominalFreq = hz; +} + +static void cpufreq_init(Object *obj) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(obj); + CpuhzState *s = CPUFREQ(obj); + + MachineState *ms = MACHINE(qdev_get_machine()); + s->num_cpu = ms->smp.max_cpus; + + s->reg_size = s->num_cpu * CPPC_REG_PER_CPU_STRIDE; + if (s->reg_size > MAX_SUPPORT_SPACE) { + error_report("Required space 0x%x excesses the max support 0x%x", + s->reg_size, MAX_SUPPORT_SPACE); + goto err_end; + } + + memory_region_init_io(&s->iomem, OBJECT(s), &cpufreq_ops, s, "cpufreq", + s->reg_size); + sysbus_init_mmio(sbd, &s->iomem); + hz_init(s); + return; + +err_end: + /* Set desired perf register offset to -1 to indicate no support for CPPC */ + cppc_regs_offset[DESIRED_PERF] = -1; +} + +static const TypeInfo cpufreq_arm_info = { + .name = TYPE_CPUFREQ, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(CpuhzState), + .instance_init = cpufreq_init, +}; + +static void cpufreq_register_types(void) +{ + type_register_static(&cpufreq_arm_info); +} + +type_init(cpufreq_register_types) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index a3d31631fe0dba5b797baecbf85432c4a8e7e8e7..755653dc265c7c2242a68ee144052c7b74b47b22 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "hw/acpi/acpi.h" +#include "hw/acpi/cpu.h" #include "hw/acpi/generic_event_device.h" #include "hw/irq.h" #include 
"hw/mem/pc-dimm.h" @@ -25,6 +26,7 @@ static const uint32_t ged_supported_events[] = { ACPI_GED_MEM_HOTPLUG_EVT, ACPI_GED_PWR_DOWN_EVT, ACPI_GED_NVDIMM_HOTPLUG_EVT, + ACPI_GED_CPU_HOTPLUG_EVT, }; /* @@ -107,6 +109,10 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev, aml_append(if_ctx, aml_call0(MEMORY_DEVICES_CONTAINER "." MEMORY_SLOT_SCAN_METHOD)); break; + case ACPI_GED_CPU_HOTPLUG_EVT: + aml_append(if_ctx, aml_call0(ACPI_CPU_CONTAINER "." + ACPI_CPU_SCAN_METHOD)); + break; case ACPI_GED_PWR_DOWN_EVT: aml_append(if_ctx, aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE), @@ -197,9 +203,9 @@ static void ged_regs_write(void *opaque, hwaddr addr, uint64_t data, switch (addr) { case ACPI_GED_REG_SLEEP_CTL: - slp_typ = (data >> 2) & 0x07; - slp_en = (data >> 5) & 0x01; - if (slp_en && slp_typ == 5) { + slp_typ = (data >> ACPI_GED_SLP_TYP_POS) & ACPI_GED_SLP_TYP_MASK; + slp_en = !!(data & ACPI_GED_SLP_EN); + if (slp_en && slp_typ == ACPI_GED_SLP_TYP_S5) { qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); } return; @@ -234,6 +240,8 @@ static void acpi_ged_device_plug_cb(HotplugHandler *hotplug_dev, } else { acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp); } + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_plug_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "virt: device plug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -248,6 +256,8 @@ static void acpi_ged_unplug_request_cb(HotplugHandler *hotplug_dev, if ((object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) && !(object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)))) { acpi_memory_unplug_request_cb(hotplug_dev, &s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_request_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -261,6 +271,8 @@ static void acpi_ged_unplug_cb(HotplugHandler *hotplug_dev, if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { acpi_memory_unplug_cb(&s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_cb(&s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -272,6 +284,7 @@ static void acpi_ged_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list) AcpiGedState *s = ACPI_GED(adev); acpi_memory_ospm_status(&s->memhp_state, list); + acpi_cpu_ospm_status(&s->cpuhp_state, list); } static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) @@ -286,6 +299,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) sel = ACPI_GED_PWR_DOWN_EVT; } else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) { sel = ACPI_GED_NVDIMM_HOTPLUG_EVT; + } else if (ev & ACPI_CPU_HOTPLUG_STATUS) { + sel = ACPI_GED_CPU_HOTPLUG_EVT; } else { /* Unknown event. Return without generating interrupt. */ warn_report("GED: Unsupported event %d. 
No irq injected", ev); @@ -318,6 +333,16 @@ static const VMStateDescription vmstate_memhp_state = { } }; +static const VMStateDescription vmstate_cpuhp_state = { + .name = "acpi-ged/cpuhp", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_CPU_HOTPLUG(cpuhp_state, AcpiGedState), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_ged_state = { .name = "acpi-ged-state", .version_id = 1, @@ -366,17 +391,55 @@ static const VMStateDescription vmstate_acpi_ged = { }, .subsections = (const VMStateDescription * []) { &vmstate_memhp_state, + &vmstate_cpuhp_state, &vmstate_ghes_state, NULL } }; +static void acpi_ged_realize(DeviceState *dev, Error **errp) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + AcpiGedState *s = ACPI_GED(dev); + uint32_t ged_events; + int i; + + ged_events = ctpop32(s->ged_event_bitmap); + + for (i = 0; i < ARRAY_SIZE(ged_supported_events) && ged_events; i++) { + uint32_t event = s->ged_event_bitmap & ged_supported_events[i]; + + if (!event) { + continue; + } + + switch (event) { + case ACPI_GED_CPU_HOTPLUG_EVT: + /* initialize CPU Hotplug related regions */ + memory_region_init(&s->container_cpuhp, OBJECT(dev), + "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(sbd, &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); + break; + } + ged_events--; + } + + if (ged_events) { + error_report("Unsupported events specified"); + abort(); + } +} + static void acpi_ged_initfn(Object *obj) { DeviceState *dev = DEVICE(obj); AcpiGedState *s = ACPI_GED(dev); SysBusDevice *sbd = SYS_BUS_DEVICE(obj); GEDState *ged_st = &s->ged_state; + MachineClass *mc; memory_region_init_io(&ged_st->evt, obj, &ged_evt_ops, ged_st, TYPE_ACPI_GED, ACPI_GED_EVT_SEL_LEN); @@ -400,6 +463,15 @@ static void acpi_ged_initfn(Object *obj) memory_region_init_io(&ged_st->regs, obj, &ged_regs_ops, ged_st, TYPE_ACPI_GED "-regs", ACPI_GED_REG_COUNT); sysbus_init_mmio(sbd, &ged_st->regs); + + mc = MACHINE_GET_CLASS(qdev_get_machine()); + if (mc->possible_cpu_arch_ids) { + memory_region_init(&s->container_cpuhp, OBJECT(dev), "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); + } } static void acpi_ged_class_init(ObjectClass *class, void *data) @@ -411,6 +483,7 @@ static void acpi_ged_class_init(ObjectClass *class, void *data) dc->desc = "ACPI Generic Event Device"; device_class_set_props(dc, acpi_ged_properties); dc->vmsd = &vmstate_acpi_ged; + dc->realize = acpi_ged_realize; hc->plug = acpi_ged_device_plug_cb; hc->unplug_request = acpi_ged_unplug_request_cb; diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build index fc1b952379a88a74f336e6ae50614ee3c25eb2d1..d36b10ea3c7e9128c23319aed19178c3e2c71ec3 100644 --- a/hw/acpi/meson.build +++ b/hw/acpi/meson.build @@ -27,6 +27,7 @@ acpi_ss.add(when: 'CONFIG_ACPI_ICH9', if_true: files('ich9.c', 'ich9_tco.c')) acpi_ss.add(when: 'CONFIG_ACPI_ERST', if_true: files('erst.c')) acpi_ss.add(when: 'CONFIG_IPMI', if_true: files('ipmi.c'), if_false: files('ipmi-stub.c')) acpi_ss.add(when: 'CONFIG_PC', if_false: files('acpi-x86-stub.c')) +acpi_ss.add(when: 'CONFIG_CPUFREQ', if_true: files('cpufreq.c')) if have_tpm acpi_ss.add(files('tpm.c')) endif diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index 3ada335a24374d4d543bc14c19b0a0b0bd1505b3..4a0ea0628f99ccc49ede3ffadc7e2aeab6f45478 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ 
-8,6 +8,7 @@ config ARM_VIRT
     imply TPM_TIS_SYSBUS
     imply TPM_TIS_I2C
     imply NVDIMM
+    imply IOMMUFD
     select ARM_GIC
     select ACPI
     select ARM_SMMUV3
@@ -29,6 +30,7 @@ config ARM_VIRT
     select ACPI_HW_REDUCED
     select ACPI_APEI
     select ACPI_VIOT
+    select ACPI_CPU_HOTPLUG
     select VIRTIO_MEM_SUPPORTED
     select ACPI_CXL
     select ACPI_HMAT
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 84ea6a807a4eddd998cb48934a773ede5146fdf1..9a33601d358c1c3610c7c9a6ae974e66027a3ad4 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -11,6 +11,7 @@
 #include "qemu/datadir.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qemu/log.h"
 #include <libfdt.h>
 #include "hw/arm/boot.h"
 #include "hw/arm/linux-boot-if.h"
@@ -26,6 +27,7 @@
 #include "qemu/config-file.h"
 #include "qemu/option.h"
 #include "qemu/units.h"
+#include "kvm_arm.h"
 
 /* Kernel boot protocol is specified in the kernel docs
  * Documentation/arm/Booting and Documentation/arm64/booting.txt
@@ -41,6 +43,9 @@
 
 #define BOOTLOADER_MAX_SIZE (4 * KiB)
 
+#define UEFI_MAX_SIZE 0x8000000
+#define UEFI_LOADER_START 0x0
+#define DTB_MAX 0x200000
 
 AddressSpace *arm_boot_address_space(ARMCPU *cpu,
                                      const struct arm_boot_info *info)
 {
@@ -682,7 +687,7 @@ fail:
     return -1;
 }
 
-static void do_cpu_reset(void *opaque)
+void do_cpu_reset(void *opaque)
 {
     ARMCPU *cpu = opaque;
     CPUState *cs = CPU(cpu);
@@ -1141,10 +1146,59 @@ static void arm_setup_direct_kernel_boot(ARMCPU *cpu,
     for (cs = first_cpu; cs; cs = CPU_NEXT(cs)) {
         ARM_CPU(cs)->env.boot_info = info;
     }
+
+    if (kvm_enabled() && virtcca_cvm_enabled()) {
+        if (info->dtb_limit == 0) {
+            info->dtb_limit = info->dtb_start + DTB_MAX;
+        }
+        kvm_load_user_data(info->loader_start, 0x1, info->dtb_start,
+                           info->dtb_limit - info->dtb_start, info->ram_size,
+                           (struct kvm_numa_info *)info->numa_info);
+        tmm_add_ram_region(info->loader_start,
+                           image_high_addr - info->loader_start,
+                           info->initrd_start,
+                           info->dtb_limit - info->initrd_start, true);
+    }
+}
+
+static void arm_setup_confidential_firmware_boot(ARMCPU *cpu,
+                                                 struct arm_boot_info *info,
+                                                 const char *firmware_filename)
+{
+    uint64_t tmi_version = 0;
+    int ret = -1;
+
+    if (kvm_enabled()) {
+        ret = kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version);
+    }
+    if (ret < 0) {
+        error_report("please check the kernel version!");
+        exit(EXIT_FAILURE);
+    }
+    if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) {
+        error_report("please check the tmi version!");
+        exit(EXIT_FAILURE);
+    }
+
+    ssize_t fw_size;
+    const char *fname;
+    AddressSpace *as = arm_boot_address_space(cpu, info);
+
+    fname = qemu_find_file(QEMU_FILE_TYPE_BIOS, firmware_filename);
+    if (!fname) {
+        error_report("Could not find firmware image '%s'", firmware_filename);
+        exit(EXIT_FAILURE);
+    }
+
+    fw_size = load_image_targphys_as(firmware_filename,
+                                     info->firmware_base,
+                                     info->firmware_max_size, as);
+
+    if (fw_size <= 0) {
+        error_report("could not load firmware '%s'", firmware_filename);
+        exit(EXIT_FAILURE);
+    }
 }
 
-static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info)
+static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info,
+                                    const char *firmware_filename)
 {
+    hwaddr mmio_start, mmio_size;
 
     /* Set up for booting firmware (which might load a kernel via fw_cfg) */
 
     if (have_dtb(info)) {
@@ -1154,6 +1208,8 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info)
          * DTB to the base of RAM for the bootloader to pick up.
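+         * For a confidential (virtCCA) guest, the DTB window must also be
+         * registered with the TMM, which is what the tmm_add_ram_region()
+         * call added below does.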
*/ info->dtb_start = info->loader_start; + if (info->confidential) + tmm_add_ram_region(UEFI_LOADER_START, UEFI_MAX_SIZE, info->dtb_start, DTB_MAX , true); } if (info->kernel_filename) { @@ -1194,6 +1250,12 @@ static void arm_setup_firmware_boot(ARMCPU *cpu, struct arm_boot_info *info) } } + if (info->confidential) { + arm_setup_confidential_firmware_boot(cpu, info, firmware_filename); + virtcca_kvm_get_mmio_addr(&mmio_start, &mmio_size); + kvm_load_user_data(info->loader_start, DTB_MAX, mmio_start, mmio_size, info->ram_size, + (struct kvm_numa_info *)info->numa_info); + } /* * We will start from address 0 (typically a boot ROM image) in the * same way as hardware. Leave env->boot_info NULL, so that @@ -1226,19 +1288,60 @@ void arm_load_kernel(ARMCPU *cpu, MachineState *ms, struct arm_boot_info *info) * doesn't support secure. */ assert(!(info->secure_board_setup && kvm_enabled())); + + qemu_log("load the kernel\n"); + info->kernel_filename = ms->kernel_filename; info->kernel_cmdline = ms->kernel_cmdline; info->initrd_filename = ms->initrd_filename; info->dtb_filename = ms->dtb; info->dtb_limit = 0; + if (kvm_enabled() && virtcca_cvm_enabled()) { + info->ram_size = ms->ram_size; + info->numa_info = g_malloc(sizeof(struct kvm_numa_info)); + struct kvm_numa_info *numa_info = (struct kvm_numa_info *) info->numa_info; + if (ms->numa_state != NULL && ms->numa_state->num_nodes > 0) { + numa_info->numa_cnt = ms->numa_state->num_nodes; + uint64_t mem_base = info->loader_start; + for (int64_t i = 0; i < ms->numa_state->num_nodes && i < MAX_NUMA_NODE; i++) { + uint64_t mem_len = ms->numa_state->nodes[i].node_mem; + numa_info->numa_nodes[i].numa_id = i; + numa_info->numa_nodes[i].ipa_start = mem_base; + numa_info->numa_nodes[i].ipa_size = mem_len; + memcpy(numa_info->numa_nodes[i].host_numa_nodes, ms->numa_state->nodes[i].node_memdev->host_nodes, + MAX_NODES / BITS_PER_LONG * sizeof(uint64_t)); + mem_base += mem_len; + } + } else { + numa_info->numa_cnt = 1; + numa_info->numa_nodes[0].numa_id = 0; + numa_info->numa_nodes[0].ipa_start = info->loader_start; + numa_info->numa_nodes[0].ipa_size = info->ram_size; + memset(numa_info->numa_nodes[0].host_numa_nodes, 0, MAX_NODES / BITS_PER_LONG * sizeof(uint64_t)); + } + + for (int cpu_idx = ms->smp.cpus - 1; cpu_idx >= 0; cpu_idx--) { + ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu_idx)); + CPUState *local_cs = CPU(armcpu); + uint64_t node_id = 0; + if (ms->possible_cpus->cpus[local_cs->cpu_index].props.has_node_id) + node_id = ms->possible_cpus->cpus[local_cs->cpu_index].props.node_id; + bitmap_set((unsigned long *)numa_info->numa_nodes[node_id].cpu_id, cpu_idx, 1); + } + } /* Load the kernel. */ if (!info->kernel_filename || info->firmware_loaded) { - arm_setup_firmware_boot(cpu, info); + arm_setup_firmware_boot(cpu, info, ms->firmware); } else { arm_setup_direct_kernel_boot(cpu, info); } + if (kvm_enabled() && virtcca_cvm_enabled()) { + g_free(info->numa_info); + info->numa_info = NULL; + } + /* * Disable the PSCI conduit if it is set up to target the same * or a lower EL than the one we're going to start the guest code in. 
diff --git a/hw/arm/mps2-tz.c b/hw/arm/mps2-tz.c index 668db5ed61962e4bbc2c0a3945e8ba704e16b88d..9d9c263ef8244c2634bf06266efd7d7ff953f1c6 100644 --- a/hw/arm/mps2-tz.c +++ b/hw/arm/mps2-tz.c @@ -435,7 +435,7 @@ static MemoryRegion *make_uart(MPS2TZMachineState *mms, void *opaque, const char *name, hwaddr size, const int *irqs, const PPCExtraData *extradata) { - /* The irq[] array is tx, rx, combined, in that order */ + /* The irq[] array is rx, tx, combined, in that order */ MPS2TZMachineClass *mmc = MPS2TZ_MACHINE_GET_CLASS(mms); CMSDKAPBUART *uart = opaque; int i = uart - &mms->uart[0]; @@ -447,8 +447,8 @@ static MemoryRegion *make_uart(MPS2TZMachineState *mms, void *opaque, qdev_prop_set_uint32(DEVICE(uart), "pclk-frq", mmc->apb_periph_frq); sysbus_realize(SYS_BUS_DEVICE(uart), &error_fatal); s = SYS_BUS_DEVICE(uart); - sysbus_connect_irq(s, 0, get_sse_irq_in(mms, irqs[0])); - sysbus_connect_irq(s, 1, get_sse_irq_in(mms, irqs[1])); + sysbus_connect_irq(s, 0, get_sse_irq_in(mms, irqs[1])); + sysbus_connect_irq(s, 1, get_sse_irq_in(mms, irqs[0])); sysbus_connect_irq(s, 2, qdev_get_gpio_in(orgate_dev, i * 2)); sysbus_connect_irq(s, 3, qdev_get_gpio_in(orgate_dev, i * 2 + 1)); sysbus_connect_irq(s, 4, get_sse_irq_in(mms, irqs[2])); diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index 9a8ac45431abb80fc5e9f60104248cb356509088..6c4b82757fef510cb6a497da737fa91fa94ebcfe 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -20,6 +20,7 @@ #include "trace.h" #include "exec/target_page.h" #include "hw/core/cpu.h" +#include "hw/pci/pci_device.h" #include "hw/qdev-properties.h" #include "qapi/error.h" #include "qemu/jhash.h" @@ -75,6 +76,16 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg, uint8_t level = 4 - (inputsize - 4) / stride; SMMUTLBEntry *entry = NULL; + /* + * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, + * KVM still requests for an iommu address space for an MSI fixup by looking + * up stage-1 page table. Make sure we don't go through the emulated pathway + * so that the emulated iotlb will not need any invalidation. + */ + if (bs->nested) { + return NULL; + } + while (level <= 3) { uint64_t subpage_size = 1ULL << level_shift(level, tt->granule_sz); uint64_t mask = subpage_size - 1; @@ -110,6 +121,16 @@ void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new) SMMUIOTLBKey *key = g_new0(SMMUIOTLBKey, 1); uint8_t tg = (new->granule - 10) / 2; + /* + * Stage-1 translation with a nested SMMU in general uses HW IOTLB. However, + * KVM still requests for an iommu address space for an MSI fixup by looking + * up stage-1 page table. Make sure we don't go through the emulated pathway + * so that the emulated iotlb will not need any invalidation. 
+ */ + if (bs->nested) { + return; + } + if (g_hash_table_size(bs->iotlb) >= SMMU_IOTLB_MAX_SIZE) { smmu_iotlb_inv_all(bs); } @@ -569,12 +590,9 @@ SMMUPciBus *smmu_find_smmu_pcibus(SMMUState *s, uint8_t bus_num) return NULL; } -static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) +static SMMUPciBus *smmu_get_sbus(SMMUState *s, PCIBus *bus) { - SMMUState *s = opaque; SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); - SMMUDevice *sdev; - static unsigned int index; if (!sbus) { sbus = g_malloc0(sizeof(SMMUPciBus) + @@ -583,7 +601,15 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) g_hash_table_insert(s->smmu_pcibus_by_busptr, bus, sbus); } - sdev = sbus->pbdev[devfn]; + return sbus; +} + +static SMMUDevice *smmu_get_sdev(SMMUState *s, SMMUPciBus *sbus, + PCIBus *bus, int devfn) +{ + SMMUDevice *sdev = sbus->pbdev[devfn]; + static unsigned int index; + if (!sdev) { char *name = g_strdup_printf("%s-%d-%d", s->mrtypename, devfn, index++); @@ -596,37 +622,386 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) memory_region_init_iommu(&sdev->iommu, sizeof(sdev->iommu), s->mrtypename, OBJECT(s), name, UINT64_MAX); + if (s->nested) { + address_space_init(&sdev->as_sysmem, &s->root, name); + } address_space_init(&sdev->as, MEMORY_REGION(&sdev->iommu), name); trace_smmu_add_mr(name); g_free(name); } - return &sdev->as; + return sdev; +} + +static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn) +{ + SMMUState *s = opaque; + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + bool is_vfio = false; + PCIDevice *pdev; + + pdev = pci_find_device(bus, pci_bus_num(bus), devfn); + if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + is_vfio = true; + } + + /* Return the system as if the device uses stage-2 only */ + if (s->nested && !sdev->s1_hwpt && is_vfio) { + return &sdev->as_sysmem; + } else { + return &sdev->as; + } +} + +static bool smmu_dev_attach_viommu(SMMUDevice *sdev, + HostIOMMUDeviceIOMMUFD *idev, Error **errp) +{ + struct iommu_hwpt_arm_smmuv3 bypass_data = { + .ste = { 0x9ULL, 0x0ULL }, //0x1ULL << (108 - 64) }, + }; + struct iommu_hwpt_arm_smmuv3 abort_data = { + .ste = { 0x1ULL, 0x0ULL }, + }; + SMMUState *s = sdev->smmu; + SMMUS2Hwpt *s2_hwpt; + SMMUViommu *viommu; + uint32_t s2_hwpt_id; + + if (s->viommu) { + return host_iommu_device_iommufd_attach_hwpt( + idev, s->viommu->s2_hwpt->hwpt_id, errp); + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, idev->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &s2_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate an S2 hwpt"); + return false; + } + + /* Attach to S2 for MSI cookie */ + if (!host_iommu_device_iommufd_attach_hwpt(idev, s2_hwpt_id, errp)) { + error_setg(errp, "failed to attach stage-2 HW pagetable"); + goto free_s2_hwpt; + } + + viommu = g_new0(SMMUViommu, 1); + + viommu->core = iommufd_backend_alloc_viommu(idev->iommufd, idev->devid, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3, + s2_hwpt_id); + if (!viommu->core) { + error_setg(errp, "failed to allocate a viommu"); + goto free_viommu; + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(abort_data), &abort_data, + &viommu->abort_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate an abort pagetable"); + goto free_viommu_core; + } + + if 
(!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, 0, + IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(bypass_data), &bypass_data, + &viommu->bypass_hwpt_id, NULL, errp)) { + error_setg(errp, "failed to allocate a bypass pagetable"); + goto free_abort_hwpt; + } + + if (!host_iommu_device_iommufd_attach_hwpt( + idev, viommu->bypass_hwpt_id, errp)) { + error_setg(errp, "failed to attach the bypass pagetable"); + goto free_bypass_hwpt; + } + + s2_hwpt = g_new0(SMMUS2Hwpt, 1); + s2_hwpt->iommufd = idev->iommufd; + s2_hwpt->hwpt_id = s2_hwpt_id; + s2_hwpt->ioas_id = idev->ioas_id; + + viommu->iommufd = idev->iommufd; + viommu->s2_hwpt = s2_hwpt; + + s->viommu = viommu; + return true; + +free_bypass_hwpt: + iommufd_backend_free_id(idev->iommufd, viommu->bypass_hwpt_id); +free_abort_hwpt: + iommufd_backend_free_id(idev->iommufd, viommu->abort_hwpt_id); +free_viommu_core: + iommufd_backend_free_id(idev->iommufd, viommu->core->viommu_id); + g_free(viommu->core); +free_viommu: + g_free(viommu); + host_iommu_device_iommufd_attach_hwpt(idev, sdev->idev->ioas_id, errp); +free_s2_hwpt: + iommufd_backend_free_id(idev->iommufd, s2_hwpt_id); + return false; +} + +static bool smmu_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) +{ + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + SMMUState *s = opaque; + SMMUPciBus *sbus = smmu_get_sbus(s, bus); + SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn); + + if (!s->nested) { + return true; + } + + if (sdev->idev) { + if (sdev->idev != idev) { + return false;//-EEXIST; + } else { + return true; + } + } + + if (!idev) { + return true; + } + + if (!smmu_dev_attach_viommu(sdev, idev, errp)) { + error_report("Unable to attach viommu"); + return false; + } + + sdev->idev = idev; + sdev->viommu = s->viommu; + QLIST_INSERT_HEAD(&s->viommu->device_list, sdev, next); + trace_smmu_set_iommu_device(devfn, smmu_get_sid(sdev)); + + return true; +} + +static void smmu_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) +{ + SMMUVdev *vdev; + SMMUDevice *sdev; + SMMUViommu *viommu; + SMMUState *s = opaque; + SMMUPciBus *sbus = g_hash_table_lookup(s->smmu_pcibus_by_busptr, bus); + + if (!s->nested) { + return; + } + + if (!sbus) { + return; + } + + sdev = sbus->pbdev[devfn]; + if (!sdev) { + return; + } + + if (!host_iommu_device_iommufd_attach_hwpt(sdev->idev, + sdev->idev->ioas_id, NULL)) { + error_report("Unable to attach dev to the default HW pagetable"); + } + + vdev = sdev->vdev; + viommu = sdev->viommu; + + sdev->idev = NULL; + sdev->viommu = NULL; + sdev->vdev = NULL; + QLIST_REMOVE(sdev, next); + trace_smmu_unset_iommu_device(devfn, smmu_get_sid(sdev)); + + if (vdev) { + iommufd_backend_free_id(viommu->iommufd, vdev->core->vdev_id); + g_free(vdev->core); + g_free(vdev); + } + + if (QLIST_EMPTY(&viommu->device_list)) { + iommufd_backend_free_id(viommu->iommufd, viommu->bypass_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->abort_hwpt_id); + iommufd_backend_free_id(viommu->iommufd, viommu->core->viommu_id); + g_free(viommu->core); + iommufd_backend_free_id(viommu->iommufd, viommu->s2_hwpt->hwpt_id); + g_free(viommu->s2_hwpt); + g_free(viommu); + s->viommu = NULL; + } +} + +static bool smmu_dev_get_pasid_cap(PCIBus *bus, + void *opaque, int devfn) +{ + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + + return true; } static const PCIIOMMUOps smmu_ops = { .get_address_space = smmu_find_add_as, + .set_iommu_device = smmu_dev_set_iommu_device, + 
.unset_iommu_device = smmu_dev_unset_iommu_device, + .get_pasid_cap = smmu_dev_get_pasid_cap, }; -IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid) +SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid) { uint8_t bus_n, devfn; SMMUPciBus *smmu_bus; - SMMUDevice *smmu; bus_n = PCI_BUS_NUM(sid); smmu_bus = smmu_find_smmu_pcibus(s, bus_n); if (smmu_bus) { devfn = SMMU_PCI_DEVFN(sid); - smmu = smmu_bus->pbdev[devfn]; - if (smmu) { - return &smmu->iommu; - } + return smmu_bus->pbdev[devfn]; } return NULL; } +/* IOMMUFD helpers */ +int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, uint8_t *pasid, void *data) +{ + uint64_t caps; + + if (!sdev || !sdev->idev) { + return -ENOENT; + } + + return !iommufd_backend_get_device_info(sdev->idev->iommufd, + sdev->idev->devid, data_type, data, + data_len, &caps, pasid, NULL); +} + +void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort) +{ + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + uint32_t hwpt_id; + + if (!s1_hwpt || !sdev->viommu) { + return; + } + + if (abort) { + hwpt_id = sdev->viommu->abort_hwpt_id; + } else { + hwpt_id = sdev->viommu->bypass_hwpt_id; + } + + /* ToDo: May be better to move the below to smmuv3. */ + if (s1_hwpt->out_fault_fd) { + struct io_uring *ring = &s1_hwpt->fault_ring; + struct io_uring_sqe *sqe; + struct __kernel_timespec ts = {.tv_sec = 0, .tv_nsec = 1}; + + s1_hwpt->exiting = true; + /* Send out a timeout sqe for the read handler to exit */ + sqe = io_uring_get_sqe(ring); + io_uring_prep_timeout(sqe, &ts, 0, 0); + io_uring_submit(ring); + + qemu_cond_signal(&s1_hwpt->fault_cond); + qemu_thread_join(&s1_hwpt->read_fault_thread); + qemu_thread_join(&s1_hwpt->write_fault_thread); + qemu_mutex_destroy(&s1_hwpt->fault_mutex); + io_uring_queue_exit(&s1_hwpt->fault_ring); + } + + if (!host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, NULL)) { + return; + } + + iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); + sdev->s1_hwpt = NULL; + g_free(s1_hwpt); +} + +int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data, + bool req_fault_fd) +{ + SMMUViommu *viommu = sdev->viommu; + SMMUS1Hwpt *s1_hwpt = sdev->s1_hwpt; + HostIOMMUDeviceIOMMUFD *idev = sdev->idev; + uint32_t flags = 0; + + if (!idev || !viommu) { + return -ENOENT; + } + + if (s1_hwpt) { + smmu_dev_uninstall_nested_ste(sdev, false); + } + + s1_hwpt = g_new0(SMMUS1Hwpt, 1); + if (!s1_hwpt) { + return -ENOMEM; + } + + s1_hwpt->smmu = sdev->smmu; + s1_hwpt->sdev = sdev; + s1_hwpt->viommu = viommu; + s1_hwpt->iommufd = idev->iommufd; + + if (req_fault_fd) { + flags |= IOMMU_HWPT_FAULT_ID_VALID; + } + + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid, + viommu->core->viommu_id, flags, data_type, + data_len, data, &s1_hwpt->hwpt_id, + &s1_hwpt->out_fault_fd, NULL)) { + goto free; + } + + if (!host_iommu_device_iommufd_attach_hwpt(idev, s1_hwpt->hwpt_id, NULL)) { + goto free_hwpt; + } + + sdev->s1_hwpt = s1_hwpt; + + return 0; +free_hwpt: + iommufd_backend_free_id(idev->iommufd, s1_hwpt->hwpt_id); +free: + sdev->s1_hwpt = NULL; + g_free(s1_hwpt); + + return -EINVAL; +} + +int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs) +{ + if (!s1_hwpt) { + return -ENOENT; + } + + return iommufd_backend_invalidate_cache(s1_hwpt->iommufd, s1_hwpt->hwpt_id, + type, len, num, reqs); +} + +int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, + uint32_t len, 
uint32_t *num, void *reqs)
+{
+    if (!viommu) {
+        return -ENOENT;
+    }
+
+    return iommufd_viommu_invalidate_cache(viommu->iommufd, viommu->viommu_id,
+                                           type, len, num, reqs);
+}
+
 /* Unmap all notifiers attached to @mr */
 static void smmu_inv_notifiers_mr(IOMMUMemoryRegion *mr)
 {
@@ -664,6 +1039,14 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
                                             g_free, g_free);
     s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL);
 
+    if (s->nested) {
+        memory_region_init(&s->root, OBJECT(s), "root", UINT64_MAX);
+        memory_region_init_alias(&s->sysmem, OBJECT(s),
+                                 "smmu-sysmem", get_system_memory(), 0,
+                                 memory_region_size(get_system_memory()));
+        memory_region_add_subregion(&s->root, 0, &s->sysmem);
+    }
+
     if (s->primary_bus) {
         pci_setup_iommu(s->primary_bus, &smmu_ops, s);
     } else {
@@ -683,6 +1066,7 @@ static Property smmu_dev_properties[] = {
     DEFINE_PROP_UINT8("bus_num", SMMUState, bus_num, 0),
     DEFINE_PROP_LINK("primary-bus", SMMUState, primary_bus,
                      TYPE_PCI_BUS, PCIBus *),
+    DEFINE_PROP_BOOL("nested", SMMUState, nested, false),
     DEFINE_PROP_END_OF_LIST(),
 };
diff --git a/hw/arm/smmu-internal.h b/hw/arm/smmu-internal.h
index 843bebb185d77f44681033db54dd0c5196c7459d..5a81dd1b82af4814710f5aa9fde2907062749940 100644
--- a/hw/arm/smmu-internal.h
+++ b/hw/arm/smmu-internal.h
@@ -142,6 +142,7 @@ typedef struct SMMUIOTLBPageInvInfo {
 } SMMUIOTLBPageInvInfo;
 
 typedef struct SMMUSIDRange {
+    SMMUState *state;
     uint32_t start;
     uint32_t end;
 } SMMUSIDRange;
diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h
index 6076025ad6a0f9031c5c03f06dc13a7ad1b6cd7b..cfc04c563e0229f6bc3f1ee33f3d942827172551 100644
--- a/hw/arm/smmuv3-internal.h
+++ b/hw/arm/smmuv3-internal.h
@@ -74,6 +74,7 @@ REG32(IDR1, 0x4)
     FIELD(IDR1, ECMDQ, 31, 1)
 
 #define SMMU_IDR1_SIDSIZE 16
+#define SMMU_IDR1_SSIDSIZE 16
 #define SMMU_CMDQS   19
 #define SMMU_EVENTQS 19
 
@@ -104,7 +105,7 @@ REG32(IDR5, 0x14)
      FIELD(IDR5, VAX, 10, 2);
      FIELD(IDR5, STALL_MAX, 16, 16);
 
-#define SMMU_IDR5_OAS 4
+#define SMMU_IDR5_OAS 5
 
 REG32(IIDR, 0x18)
 REG32(AIDR, 0x1c)
@@ -226,6 +227,19 @@ static inline bool smmuv3_gerror_irq_enabled(SMMUv3State *s)
 #define Q_CONS_WRAP(q) (((q)->cons & WRAP_MASK(q)) >> (q)->log2size)
 #define Q_PROD_WRAP(q) (((q)->prod & WRAP_MASK(q)) >> (q)->log2size)
 
+#define Q_IDX(llq, p) ((p) & ((1 << (llq)->max_n_shift) - 1))
+
+static inline int smmuv3_q_ncmds(SMMUQueue *q)
+{
+    uint32_t prod = Q_PROD(q);
+    uint32_t cons = Q_CONS(q);
+
+    if (Q_PROD_WRAP(q) == Q_CONS_WRAP(q)) {
+        return prod - cons;
+    } else {
+        return WRAP_MASK(q) - cons + prod;
+    }
+}
+
 static inline bool smmuv3_q_full(SMMUQueue *q)
 {
     return ((q->cons ^ q->prod) & WRAP_INDEX_MASK(q)) == WRAP_MASK(q);
@@ -552,6 +566,7 @@ typedef struct CD {
 
 #define STE_S1FMT(x)       extract32((x)->word[0], 4 , 2)
 #define STE_S1CDMAX(x)     extract32((x)->word[1], 27, 5)
+#define STE_S1DSS(x)       extract32((x)->word[2], 0, 2)
 #define STE_S1STALLD(x)    extract32((x)->word[2], 27, 1)
 #define STE_EATS(x)        extract32((x)->word[2], 28, 2)
 #define STE_STRW(x)        extract32((x)->word[2], 30, 2)
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index c3871ae067ffb74a315025a8f54513e6e81b2a11..c0fcdd75746e1df006365a9091cfd1bc4bc6a320 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -24,6 +24,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bridge.h"
 #include "cpu.h"
 #include "trace.h"
 #include "qemu/log.h"
@@ -33,6 +34,9 @@
 #include "hw/arm/smmuv3.h"
 #include "smmuv3-internal.h"
 #include "smmu-internal.h"
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
 
 #define
PTW_RECORD_FAULT(cfg) (((cfg)->stage == 1) ? (cfg)->record_faults : \ (cfg)->s2cfg.record_faults) @@ -254,6 +258,81 @@ void smmuv3_record_event(SMMUv3State *s, SMMUEventInfo *info) info->recorded = true; } +static void smmuv3_nested_init_regs(SMMUv3State *s) +{ + SMMUState *bs = ARM_SMMU(s); + SMMUDevice *sdev; + uint32_t data_type; + uint32_t val; + uint8_t pasid; + int ret; + + if (!bs->nested || !bs->viommu) { + return; + } + + sdev = QLIST_FIRST(&bs->viommu->device_list); + if (!sdev) { + return; + } + + if (sdev->info.idr[0]) { + error_report("reusing the previous hw_info"); + goto out; + } + + ret = smmu_dev_get_info(sdev, &data_type, sizeof(sdev->info), &pasid, + &sdev->info); + if (ret) { + error_report("failed to get SMMU device info"); + return; + } + + if (data_type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) { + error_report( "Wrong data type (%d)!", data_type); + return; + } + +out: + trace_smmuv3_get_device_info(sdev->info.idr[0], sdev->info.idr[1], + sdev->info.idr[3], sdev->info.idr[5]); + + val = FIELD_EX32(sdev->info.idr[0], IDR0, BTM); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, BTM, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, ATS); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ATS, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, ASID16); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, TERM_MODEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, STALL_MODEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, val); + val = FIELD_EX32(sdev->info.idr[0], IDR0, STLEVEL); + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, val); + + val = FIELD_EX32(sdev->info.idr[1], IDR1, SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, val); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, pasid); + + val = FIELD_EX32(sdev->info.idr[3], IDR3, HAD); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, HAD, val); + val = FIELD_EX32(sdev->info.idr[3], IDR3, RIL); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, val); + val = FIELD_EX32(sdev->info.idr[3], IDR3, BBML); + s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, val); + + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN4K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN16K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, GRAN64K); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, val); + val = FIELD_EX32(sdev->info.idr[5], IDR5, OAS); + s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, val); + + /* FIXME check iidr and aidr registrs too */ +} + static void smmuv3_init_regs(SMMUv3State *s) { /* Based on sys property, the stages supported in smmu will be advertised.*/ @@ -268,13 +347,14 @@ static void smmuv3_init_regs(SMMUv3State *s) s->idr[0] = FIELD_DP32(s->idr[0], IDR0, ASID16, 1); /* 16-bit ASID */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, VMID16, 1); /* 16-bit VMID */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TTENDIAN, 2); /* little endian */ - s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 1); /* No stall */ + s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STALL_MODEL, 0); /* stall */ /* terminated transaction will always be aborted/error returned */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, TERM_MODEL, 1); /* 2-level stream table supported */ s->idr[0] = FIELD_DP32(s->idr[0], IDR0, STLEVEL, 1); s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SIDSIZE, SMMU_IDR1_SIDSIZE); + s->idr[1] = FIELD_DP32(s->idr[1], IDR1, SSIDSIZE, SMMU_IDR1_SSIDSIZE); 
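+    /*
+     * The SSIDSIZE advertised above exposes 16-bit substream IDs (PASIDs)
+     * to the guest; decode_ste() below relies on it to accept STEs with a
+     * non-zero S1CDMAX, i.e. multiple context descriptors per stream.
+     */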
 s->idr[1] = FIELD_DP32(s->idr[1], IDR1, EVENTQS, SMMU_EVENTQS);
     s->idr[1] = FIELD_DP32(s->idr[1], IDR1, CMDQS,   SMMU_CMDQS);
 
@@ -286,12 +366,15 @@ static void smmuv3_init_regs(SMMUv3State *s)
     s->idr[3] = FIELD_DP32(s->idr[3], IDR3, RIL, 1);
     s->idr[3] = FIELD_DP32(s->idr[3], IDR3, BBML, 2);
 
-    s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 44 bits */
+    s->idr[5] = FIELD_DP32(s->idr[5], IDR5, OAS, SMMU_IDR5_OAS); /* 48 bits */
     /* 4K, 16K and 64K granule support */
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN4K, 1);
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN16K, 1);
     s->idr[5] = FIELD_DP32(s->idr[5], IDR5, GRAN64K, 1);
 
+    /* Override IDR fields with HW caps */
+    smmuv3_nested_init_regs(s);
+
     s->cmdq.base = deposit64(s->cmdq.base, 0, 5, SMMU_CMDQS);
     s->cmdq.prod = 0;
     s->cmdq.cons = 0;
@@ -486,6 +569,27 @@ bad_ste:
     return -EINVAL;
 }
 
+static void decode_ste_config(SMMUTransCfg *cfg, uint32_t config)
+{
+    if (STE_CFG_ABORT(config)) {
+        cfg->aborted = true;
+        return;
+    }
+    if (STE_CFG_BYPASS(config)) {
+        cfg->bypassed = true;
+        return;
+    }
+
+    if (STE_CFG_S1_ENABLED(config)) {
+        cfg->stage = SMMU_STAGE_1;
+    }
+
+    if (STE_CFG_S2_ENABLED(config)) {
+        cfg->stage |= SMMU_STAGE_2;
+    }
+}
+
 /* Returns < 0 in case of invalid STE, 0 otherwise */
 static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
                       STE *ste, SMMUEventInfo *event)
@@ -502,12 +606,19 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
 
     config = STE_CONFIG(ste);
 
-    if (STE_CFG_ABORT(config)) {
+    decode_ste_config(cfg, config);
+
+    /* S1DSS.Terminate is the same as Config.abort for the default stream */
+    if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0) {
         cfg->aborted = true;
+    }
+
+    if (cfg->aborted || cfg->bypassed) {
         return 0;
     }
 
-    if (STE_CFG_BYPASS(config)) {
+    /* S1DSS.Bypass is the same as Config.bypass for the default stream */
+    if (STE_CFG_S1_ENABLED(config) && STE_S1DSS(ste) == 0x1) {
         cfg->bypassed = true;
         return 0;
     }
@@ -545,13 +656,14 @@ static int decode_ste(SMMUv3State *s, SMMUTransCfg *cfg,
         }
     }
 
-    if (STE_S1CDMAX(ste) != 0) {
+    if (!FIELD_EX32(s->idr[1], IDR1, SSIDSIZE) && STE_S1CDMAX(ste) != 0) {
         qemu_log_mask(LOG_UNIMP,
                       "SMMUv3 does not support multiple context descriptors yet\n");
         goto bad_ste;
     }
 
-    if (STE_S1STALLD(ste)) {
+    /* STALL_MODEL being 0b01 means "stall is not supported" */
+    if ((FIELD_EX32(s->idr[0], IDR0, STALL_MODEL) & 0x1) && STE_S1STALLD(ste)) {
         qemu_log_mask(LOG_UNIMP,
                       "SMMUv3 S1 stalling fault model not allowed yet\n");
         goto bad_ste;
@@ -669,9 +781,6 @@ static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event)
     if (!CD_A(cd)) {
         goto bad_cd; /* SMMU_IDR0.TERM_MODEL == 1 */
     }
-    if (CD_S(cd)) {
-        goto bad_cd; /* !STE_SECURE && SMMU_IDR0.STALL_MODEL == 1 */
-    }
     if (CD_HA(cd) || CD_HD(cd)) {
         goto bad_cd; /* HTTU = 0 */
     }
@@ -1153,30 +1262,352 @@ static void smmuv3_range_inval(SMMUState *s, Cmd *cmd)
     }
 }
 
+static void smmuv3_report_iommu_fault(SMMUS1Hwpt *hwpt,
+                                      struct iommu_hwpt_pgfault *fault)
+{
+    PendFaultEntry *pend;
+    SMMUDevice *sdev = hwpt->sdev;
+    SMMUv3State *s3 = sdev->smmu;
+    uint32_t sid = smmu_get_sid(sdev);
+    SMMUEventInfo info = {0};
+
+    info.sid = sid;
+    info.type = SMMU_EVT_F_TRANSLATION;
+    info.u.f_translation.addr = fault->addr;
+    info.u.f_translation.stall = true;
+    info.u.f_translation.ssid = fault->pasid;
+    info.u.f_translation.stag = fault->grpid;
+
+    if (fault->flags & IOMMU_PGFAULT_FLAGS_PASID_VALID) {
+        info.u.f_translation.ssv = true;
+    }
+    if (fault->perm & IOMMU_PGFAULT_PERM_READ) {
+        info.u.f_translation.rnw = true;
+    }
+    if (fault->perm &
IOMMU_PGFAULT_PERM_PRIV) { + info.u.f_translation.pnu = true; + } + if (fault->perm & IOMMU_PGFAULT_PERM_EXEC) { + info.u.f_translation.ind = true; + } + + pend = g_new0(PendFaultEntry, 1); + memcpy(&pend->fault, fault, sizeof(*fault)); + qemu_mutex_lock(&hwpt->fault_mutex); + QTAILQ_INSERT_TAIL(&hwpt->pendfault, pend, entry); + qemu_mutex_unlock(&hwpt->fault_mutex); + smmuv3_record_event(s3, &info); +} + +static void smmuv3_notify_stall_resume(SMMUState *bs, uint32_t sid, + uint32_t stag, uint32_t code) +{ + SMMUDevice *sdev = smmu_find_sdev(bs, sid); + PageRespEntry *msg; + PendFaultEntry *pend, *tmp; + SMMUS1Hwpt *hwpt; + bool found = false; + + if (!sdev) { + return; + } + + hwpt = sdev->s1_hwpt; + msg = g_new0(PageRespEntry, 1); + + /* Kernel expects addr and pasid info for page response */ + qemu_mutex_lock(&hwpt->fault_mutex); + QTAILQ_FOREACH_SAFE(pend, &hwpt->pendfault, entry, tmp) { + if (pend->fault.grpid == stag) { + QTAILQ_REMOVE(&hwpt->pendfault, pend, entry); + msg->resp.cookie = pend->fault.cookie; + msg->resp.code = code; + QTAILQ_INSERT_TAIL(&hwpt->pageresp, msg, entry); + qemu_cond_signal(&hwpt->fault_cond); + + g_free(pend); + found = true; + break; + } + } + + qemu_mutex_unlock(&hwpt->fault_mutex); + if (!found) { + warn_report("No matching fault for resume(stag 0x%x), dropping it", stag); + g_free(msg); + } +} + +static void *write_fault_handler(void *opaque) +{ + SMMUS1Hwpt *hwpt = opaque; + PageRespEntry *msg, *tmp; + struct iommu_hwpt_page_response *resp; + int ret; + + resp = g_new0(struct iommu_hwpt_page_response, 1); + while (!hwpt->exiting) { + /* Check whether we have any pending responses */ + qemu_mutex_lock(&hwpt->fault_mutex); + qemu_cond_wait(&hwpt->fault_cond, &hwpt->fault_mutex); + QTAILQ_FOREACH_SAFE(msg, &hwpt->pageresp, entry, tmp) { + QTAILQ_REMOVE(&hwpt->pageresp, msg, entry); + memcpy(resp, &msg->resp, sizeof(*resp)); + g_free(msg); + + ret = write(hwpt->out_fault_fd, resp, sizeof(*resp)); + if (ret != sizeof(*resp)) { + warn_report("Write resp[cookie 0x%x] failed: %d", + resp->cookie, ret); + } + } + qemu_mutex_unlock(&hwpt->fault_mutex); + } + g_free(resp); + return NULL; +} + +static void *read_fault_handler(void *opaque) +{ + SMMUS1Hwpt *hwpt = opaque; + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + struct iommu_hwpt_pgfault *fault; + struct io_uring *ring = &hwpt->fault_ring; + void *data; + int ret; + + fault = g_new0(struct iommu_hwpt_pgfault, 1); + while (!hwpt->exiting) { + sqe = io_uring_get_sqe(ring); + io_uring_prep_read(sqe, hwpt->out_fault_fd, fault, + sizeof(*fault), 0); + io_uring_sqe_set_data(sqe, fault); + io_uring_submit(ring); + + ret = io_uring_wait_cqe(ring, &cqe); + if (ret == 0) { + if (cqe->res == sizeof(*fault)) { + data = io_uring_cqe_get_data(cqe); + smmuv3_report_iommu_fault(hwpt, data); + } + io_uring_cqe_seen(ring, cqe); + } else { + warn_report("Read fault[hwpt_id 0x%x] failed: %d", + hwpt->hwpt_id, ret); + } + } + g_free(fault); + return NULL; +} + +static void create_fault_handlers(SMMUS1Hwpt *hwpt) +{ + if (!hwpt->out_fault_fd) { + warn_report("No fault fd for hwpt id: %d", hwpt->hwpt_id); + return; + } + + io_uring_queue_init(1024, &hwpt->fault_ring, 0); + qemu_mutex_init(&hwpt->fault_mutex); + qemu_cond_init(&hwpt->fault_cond); + QTAILQ_INIT(&hwpt->pageresp); + QTAILQ_INIT(&hwpt->pendfault); + qemu_thread_create(&hwpt->read_fault_thread, "io fault read", + read_fault_handler, + hwpt, QEMU_THREAD_JOINABLE); + qemu_thread_create(&hwpt->write_fault_thread, "io fault write", + write_fault_handler, + hwpt,
QEMU_THREAD_JOINABLE); +} + +static void smmuv3_install_nested_ste(SMMUDevice *sdev, int sid) +{ +#ifdef __linux__ + SMMUEventInfo event = {.type = SMMU_EVT_NONE, .sid = sid, + .inval_ste_allowed = true}; + struct iommu_hwpt_arm_smmuv3 nested_data = {}; + SMMUv3State *s = sdev->smmu; + SMMUState *bs = &s->smmu_state; + bool req_fault_fd = false; + uint32_t config; + STE ste; + int ret; + + if (!sdev->viommu || !bs->nested) { + return; + } + + if (!sdev->vdev && sdev->idev && sdev->viommu) { + SMMUVdev *vdev = g_new0(SMMUVdev, 1); + vdev->core = iommufd_backend_alloc_vdev(sdev->idev, sdev->viommu->core, + sid); + if (!vdev->core) { + error_report("failed to allocate a vDEVICE"); + g_free(vdev); + return; + } + sdev->vdev = vdev; + } + + ret = smmu_find_ste(sdev->smmu, sid, &ste, &event); + if (ret) { + /* + * For a 2-level Stream Table, the level-2 table might not be ready + * until the device gets inserted into the stream table. Ignore this. + */ + return; + } + + config = STE_CONFIG(&ste); + if (!STE_VALID(&ste) || !STE_CFG_S1_ENABLED(config)) { + smmu_dev_uninstall_nested_ste(sdev, STE_CFG_ABORT(config)); + smmuv3_flush_config(sdev); + return; + } + + nested_data.ste[0] = (uint64_t)ste.word[0] | (uint64_t)ste.word[1] << 32; + nested_data.ste[1] = (uint64_t)ste.word[2] | (uint64_t)ste.word[3] << 32; + /* V | CONFIG | S1FMT | S1CTXPTR | S1CDMAX */ + nested_data.ste[0] &= 0xf80fffffffffffffULL; + /* S1DSS | S1CIR | S1COR | S1CSH | S1STALLD | EATS */ + nested_data.ste[1] &= 0x380000ffULL; + + if (STE_S1CDMAX(&ste)) { + req_fault_fd = true; + } + + ret = smmu_dev_install_nested_ste(sdev, IOMMU_HWPT_DATA_ARM_SMMUV3, + sizeof(nested_data), &nested_data, + req_fault_fd); + if (ret) { + error_report("Unable to install nested STE=0x%016"PRIx64":0x%016"PRIx64", ret=%d", + nested_data.ste[1], nested_data.ste[0], ret); + } + + if (req_fault_fd) { + create_fault_handlers(sdev->s1_hwpt); + } + + trace_smmuv3_install_nested_ste(sid, nested_data.ste[1], nested_data.ste[0]); +#endif +} + static gboolean -smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) +_smmuv3_invalidate_ste(SMMUDevice *sdev, SMMUSIDRange *sid_range) { - SMMUDevice *sdev = (SMMUDevice *)key; uint32_t sid = smmu_get_sid(sdev); - SMMUSIDRange *sid_range = (SMMUSIDRange *)user_data; if (sid < sid_range->start || sid > sid_range->end) { return false; } + smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); trace_smmuv3_config_cache_inv(sid); return true; } +static gboolean +smmuv3_invalidate_ste(gpointer key, gpointer value, gpointer user_data) +{ + return _smmuv3_invalidate_ste((SMMUDevice *)key, (SMMUSIDRange *)user_data); +} + +static void smmuv3_invalidate_nested_ste(SMMUSIDRange *sid_range) +{ + SMMUState *bs = sid_range->state; + SMMUDevice *sdev; + + if (!bs->viommu) { + return; + } + + QLIST_FOREACH(sdev, &bs->viommu->device_list, next) { + if (smmu_get_sid(sdev)) { + _smmuv3_invalidate_ste(sdev, sid_range); + } + } +} + +/** + * SMMUCommandBatch - batch of commands to issue for nested SMMU invalidation + * @cmds: Pointer to list of commands + * @cons: Pointer to list of CONS corresponding to the commands + * @ncmds: Total number of commands in the batch + * @dev_cache: Issue to a device cache + */ +typedef struct SMMUCommandBatch { + Cmd *cmds; + uint32_t *cons; + uint32_t ncmds; + bool dev_cache; +} SMMUCommandBatch; + +/* Update batch->ncmds to the number of executed cmds */ +static int smmuv3_issue_cmd_batch(SMMUState *bs, SMMUCommandBatch *batch) +{ + uint32_t total = batch->ncmds; + int ret; + + ret =
smmu_viommu_invalidate_cache(bs->viommu->core, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3, + sizeof(Cmd), &batch->ncmds, batch->cmds); + if (total != batch->ncmds) { + error_report("%s failed: ret=%d, total=%d, done=%d", + __func__, ret, total, batch->ncmds); + return ret; + } + + batch->ncmds = 0; + batch->dev_cache = false; + return ret; +} + +static int smmuv3_batch_cmds(SMMUState *bs, SMMUCommandBatch *batch, + Cmd *cmd, uint32_t *cons, bool dev_cache) +{ + int ret; + + if (!bs->nested || !bs->viommu) { + return 0; + } + + /* + * Currently separate dev_cache and hwpt for safety, which might not be + * necessary if underlying HW SMMU does not have the errata. + * + * TODO check IIDR register values read from hw_info. + */ + if (batch->ncmds && (dev_cache != batch->dev_cache)) { + ret = smmuv3_issue_cmd_batch(bs, batch); + if (ret) { + *cons = batch->cons[batch->ncmds]; + return ret; + } + } + batch->dev_cache = dev_cache; + batch->cmds[batch->ncmds] = *cmd; + batch->cons[batch->ncmds++] = *cons; + return 0; +} + static int smmuv3_cmdq_consume(SMMUv3State *s) { SMMUState *bs = ARM_SMMU(s); SMMUCmdError cmd_error = SMMU_CERROR_NONE; SMMUQueue *q = &s->cmdq; SMMUCommandType type = 0; + SMMUCommandBatch batch = {}; + uint32_t ncmds = 0; if (!smmuv3_cmdq_enabled(s)) { return 0; } + + ncmds = smmuv3_q_ncmds(q); + batch.cmds = g_new0(Cmd, ncmds); + batch.cons = g_new0(uint32_t, ncmds); + /* * some commands depend on register values, typically CR0. In case those * register values change while handling the command, spec says it @@ -1217,21 +1648,20 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_CFGI_STE: { uint32_t sid = CMD_SID(&cmd); - IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); - SMMUDevice *sdev; + SMMUDevice *sdev = smmu_find_sdev(bs, sid); if (CMD_SSEC(&cmd)) { cmd_error = SMMU_CERROR_ILL; break; } - if (!mr) { + if (!sdev) { break; } trace_smmuv3_cmdq_cfgi_ste(sid); - sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); + smmuv3_install_nested_ste(sdev, sid); break; } @@ -1247,33 +1677,40 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) } mask = (1ULL << (range + 1)) - 1; + sid_range.state = bs; sid_range.start = sid & ~mask; sid_range.end = sid_range.start + mask; trace_smmuv3_cmdq_cfgi_ste_range(sid_range.start, sid_range.end); g_hash_table_foreach_remove(bs->configs, smmuv3_invalidate_ste, &sid_range); + smmuv3_invalidate_nested_ste(&sid_range); break; } case SMMU_CMD_CFGI_CD: case SMMU_CMD_CFGI_CD_ALL: { uint32_t sid = CMD_SID(&cmd); - IOMMUMemoryRegion *mr = smmu_iommu_mr(bs, sid); - SMMUDevice *sdev; + SMMUDevice *sdev = smmu_find_sdev(bs, sid); if (CMD_SSEC(&cmd)) { cmd_error = SMMU_CERROR_ILL; break; } - if (!mr) { + if (!sdev) { break; } trace_smmuv3_cmdq_cfgi_cd(sid); - sdev = container_of(mr, SMMUDevice, iommu); smmuv3_flush_config(sdev); + + if (sdev->s1_hwpt) { + if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + } break; } case SMMU_CMD_TLBI_NH_ASID: @@ -1288,6 +1725,10 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_tlbi_nh_asid(asid); smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_asid(bs, asid); + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; } case SMMU_CMD_TLBI_NH_ALL: @@ -1300,6 +1741,11 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) trace_smmuv3_cmdq_tlbi_nh(); smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_all(bs); + + if (smmuv3_batch_cmds(bs, &batch, &cmd, 
&q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; case SMMU_CMD_TLBI_NH_VAA: case SMMU_CMD_TLBI_NH_VA: @@ -1308,7 +1754,24 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) break; } smmuv3_range_inval(bs, &cmd); + + if (smmuv3_batch_cmds(bs, &batch, &cmd, &q->cons, false)) { + cmd_error = SMMU_CERROR_ILL; + break; + } break; + case SMMU_CMD_ATC_INV: + { + SMMUDevice *sdev = smmu_find_sdev(bs, CMD_SID(&cmd)); + + if (sdev && sdev->s1_hwpt) { + if (smmuv3_batch_cmds(sdev->smmu, &batch, &cmd, &q->cons, true)) { + cmd_error = SMMU_CERROR_ILL; + break; + } + } + break; + } case SMMU_CMD_TLBI_S12_VMALL: { uint16_t vmid = CMD_VMID(&cmd); @@ -1340,12 +1803,23 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) case SMMU_CMD_TLBI_EL2_ASID: case SMMU_CMD_TLBI_EL2_VA: case SMMU_CMD_TLBI_EL2_VAA: - case SMMU_CMD_ATC_INV: case SMMU_CMD_PRI_RESP: - case SMMU_CMD_RESUME: case SMMU_CMD_STALL_TERM: trace_smmuv3_unhandled_cmd(type); break; + case SMMU_CMD_RESUME: + { + uint32_t sid = CMD_SID(&cmd); + uint16_t stag = CMD_RESUME_STAG(&cmd); + uint8_t action = CMD_RESUME_AC(&cmd); + uint32_t code = IOMMUFD_PAGE_RESP_INVALID; + + if (action) { + code = IOMMUFD_PAGE_RESP_SUCCESS; + } + smmuv3_notify_stall_resume(bs, sid, stag, code); + break; + } default: cmd_error = SMMU_CERROR_ILL; break; @@ -1365,12 +1839,22 @@ static int smmuv3_cmdq_consume(SMMUv3State *s) */ queue_cons_incr(q); } + qemu_mutex_lock(&s->mutex); + if (!cmd_error && batch.ncmds && bs->viommu) { + if (smmuv3_issue_cmd_batch(bs, &batch)) { + q->cons = batch.cons[batch.ncmds]; + cmd_error = SMMU_CERROR_ILL; + } + } + qemu_mutex_unlock(&s->mutex); if (cmd_error) { trace_smmuv3_cmdq_consume_error(smmu_cmd_string(type), cmd_error); smmu_write_cmdq_err(s, cmd_error); smmuv3_trigger_irq(s, SMMU_IRQ_GERROR, R_GERROR_CMDQ_ERR_MASK); } + g_free(batch.cmds); + g_free(batch.cons); trace_smmuv3_cmdq_consume_out(Q_PROD(q), Q_CONS(q), Q_PROD_WRAP(q), Q_CONS_WRAP(q)); @@ -1746,6 +2230,11 @@ static void smmu_realize(DeviceState *d, Error **errp) SysBusDevice *dev = SYS_BUS_DEVICE(d); Error *local_err = NULL; + if (s->stage && strcmp("1", s->stage)) { + /* Only support nested mode with a stage-1-only vSMMU */ + sys->nested = false; + } + c->parent_realize(d, &local_err); if (local_err) { error_propagate(errp, local_err); @@ -1764,6 +2253,38 @@ static void smmu_realize(DeviceState *d, Error **errp) smmu_init_irq(s, dev); } +static int smmuv3_accel_pci_host_bridge(Object *obj, void *opaque) +{ + DeviceState *d = opaque; + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + if (d->parent_bus && !strcmp(bus->qbus.name, d->parent_bus->name)) { + object_property_set_link(OBJECT(d), "primary-bus", OBJECT(bus), + &error_abort); + } + } + return 0; +} + +static void smmu_accel_realize(DeviceState *d, Error **errp) +{ + SMMUv3AccelState *s_nested = ARM_SMMUV3_ACCEL(d); + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_GET_CLASS(s_nested); + SysBusDevice *dev = SYS_BUS_DEVICE(d); + Error *local_err = NULL; + + object_child_foreach_recursive(object_get_root(), + smmuv3_accel_pci_host_bridge, d); + object_property_set_bool(OBJECT(dev), "nested", true, &error_abort); + + c->parent_realize(d, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } +} + static const VMStateDescription vmstate_smmuv3_queue = { .name = "smmuv3_queue", .version_id = 1, @@ -1862,6 +2383,19 @@ static void smmuv3_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, smmuv3_properties); } +static void
smmuv3_accel_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_CLASS(klass); + + dc->vmsd = &vmstate_smmuv3; + device_class_set_parent_realize(dc, smmu_accel_realize, + &c->parent_realize); + dc->user_creatable = true; + dc->hotpluggable = false; + dc->bus_type = TYPE_PCIE_BUS; +} + static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, IOMMUNotifierFlag old, IOMMUNotifierFlag new, @@ -1876,12 +2410,9 @@ static int smmuv3_notify_flag_changed(IOMMUMemoryRegion *iommu, return -EINVAL; } - if (new & IOMMU_NOTIFIER_MAP) { - error_setg(errp, - "device %02x.%02x.%x requires iommu MAP notifier which is " - "not currently supported", pci_bus_num(sdev->bus), - PCI_SLOT(sdev->devfn), PCI_FUNC(sdev->devfn)); - return -EINVAL; + /* nested-smmuv3 does not need IOMMU_NOTIFIER_MAP. Ignore it. */ + if (s->nested) { + new &= ~IOMMU_NOTIFIER_MAP; } if (old == IOMMU_NOTIFIER_NONE) { @@ -1903,6 +2434,14 @@ static void smmuv3_iommu_memory_region_class_init(ObjectClass *klass, imrc->notify_flag_changed = smmuv3_notify_flag_changed; } +static const TypeInfo smmuv3_accel_type_info = { + .name = TYPE_ARM_SMMUV3_ACCEL, + .parent = TYPE_ARM_SMMUV3, + .instance_size = sizeof(SMMUv3AccelState), + .class_size = sizeof(SMMUv3AccelClass), + .class_init = smmuv3_accel_class_init, +}; + static const TypeInfo smmuv3_type_info = { .name = TYPE_ARM_SMMUV3, .parent = TYPE_ARM_SMMU, @@ -1921,6 +2460,7 @@ static const TypeInfo smmuv3_iommu_memory_region_info = { static void smmuv3_register_types(void) { type_register(&smmuv3_type_info); + type_register(&smmuv3_accel_type_info); type_register(&smmuv3_iommu_memory_region_info); } diff --git a/hw/arm/trace-events b/hw/arm/trace-events index cdc1ea06a81c560246fb94cabbeb4e0ab26ed3c4..490da6349cf49193bed9e49d1687eb5fb9de9561 100644 --- a/hw/arm/trace-events +++ b/hw/arm/trace-events @@ -5,6 +5,8 @@ virt_acpi_setup(void) "No fw cfg or ACPI disabled. Bailing out." 
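An aside on the QOM idiom smmuv3_accel_class_init() relies on above: device_class_set_parent_realize() stashes the parent class's realize hook in the subclass so the child can run its own setup and then chain up, exactly as smmu_accel_realize() does. A minimal, self-contained sketch of that pattern follows; the MyAccel* names and cast macros are hypothetical stand-ins, not part of this patch:

typedef struct MyAccelClass {
    SysBusDeviceClass parent_class;
    DeviceRealize parent_realize;   /* parent's realize, saved at class-init time */
} MyAccelClass;

static void my_accel_realize(DeviceState *dev, Error **errp)
{
    MyAccelClass *c = MY_ACCEL_GET_CLASS(dev);   /* hypothetical cast macro */

    /* subclass-specific setup would go here, then chain up: */
    c->parent_realize(dev, errp);
}

static void my_accel_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    MyAccelClass *c = MY_ACCEL_CLASS(klass);     /* hypothetical cast macro */

    /* Save the parent realize hook and install our override in its place. */
    device_class_set_parent_realize(dc, my_accel_realize, &c->parent_realize);
}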
# smmu-common.c smmu_add_mr(const char *name) "%s" +smmu_set_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" +smmu_unset_iommu_device(int devfn, uint32_t sid) "devfn=%d (sid=%d)" smmu_ptw_level(int stage, int level, uint64_t iova, size_t subpage_size, uint64_t baseaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d iova=0x%"PRIx64" subpage_sz=0x%zx baseaddr=0x%"PRIx64" offset=%d => pte=0x%"PRIx64 smmu_ptw_invalid_pte(int stage, int level, uint64_t baseaddr, uint64_t pteaddr, uint32_t offset, uint64_t pte) "stage=%d level=%d base@=0x%"PRIx64" pte@=0x%"PRIx64" offset=%d pte=0x%"PRIx64 smmu_ptw_page_pte(int stage, int level, uint64_t iova, uint64_t baseaddr, uint64_t pteaddr, uint64_t pte, uint64_t address) "stage=%d level=%d iova=0x%"PRIx64" base@=0x%"PRIx64" pte@=0x%"PRIx64" pte=0x%"PRIx64" page address = 0x%"PRIx64 @@ -53,5 +55,7 @@ smmuv3_cmdq_tlbi_s12_vmid(uint16_t vmid) "vmid=%d" smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x" smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s" smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s" +smmuv3_get_device_info(uint32_t idr0, uint32_t idr1, uint32_t idr3, uint32_t idr5) "idr0=0x%x idr1=0x%x idr3=0x%x idr5=0x%x" smmuv3_inv_notifiers_iova(const char *name, uint16_t asid, uint16_t vmid, uint64_t iova, uint8_t tg, uint64_t num_pages) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64 +smmuv3_install_nested_ste(uint32_t sid, uint64_t ste_1, uint64_t ste_0) "sid=%d ste=%"PRIx64":%"PRIx64 diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 8bc35a483c9233f4b718bd3a66c5ec807d048b1b..5e949671a1f67dabe82d8a94b40bfd6f868ddda9 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -58,11 +58,255 @@ #include "migration/vmstate.h" #include "hw/acpi/ghes.h" #include "hw/acpi/viot.h" +#include "kvm_arm.h" #define ARM_SPI_BASE 32 #define ACPI_BUILD_TABLE_SIZE 0x20000 +/* + * ACPI spec, Revision 6.3 + * 5.2.29.2 Cache Type Structure (Type 1) + */ +static void build_cache_hierarchy_node(GArray *tbl, uint32_t next_level, + uint32_t cache_type) +{ + build_append_byte(tbl, 1); + build_append_byte(tbl, 24); + build_append_int_noprefix(tbl, 0, 2); + build_append_int_noprefix(tbl, 127, 4); + build_append_int_noprefix(tbl, next_level, 4); + + switch (cache_type) { + case ARM_L1D_CACHE: /* L1 dcache info */ + build_append_int_noprefix(tbl, ARM_L1DCACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1DCACHE_SETS, 4); + build_append_byte(tbl, ARM_L1DCACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1DCACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1DCACHE_LINE_SIZE, 2); + break; + case ARM_L1I_CACHE: /* L1 icache info */ + build_append_int_noprefix(tbl, ARM_L1ICACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1ICACHE_SETS, 4); + build_append_byte(tbl, ARM_L1ICACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1ICACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1ICACHE_LINE_SIZE, 2); + break; + case ARM_L1_CACHE: /* L1 cache info */ + build_append_int_noprefix(tbl, ARM_L1CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L1CACHE_SETS, 4); + build_append_byte(tbl, ARM_L1CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L1CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L1CACHE_LINE_SIZE, 2); + break; + case ARM_L2_CACHE: /* L2 cache info */ + build_append_int_noprefix(tbl, ARM_L2CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L2CACHE_SETS, 4); + build_append_byte(tbl, 
ARM_L2CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L2CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L2CACHE_LINE_SIZE, 2); + break; + case ARM_L3_CACHE: /* L3 cache info */ + build_append_int_noprefix(tbl, ARM_L3CACHE_SIZE, 4); + build_append_int_noprefix(tbl, ARM_L3CACHE_SETS, 4); + build_append_byte(tbl, ARM_L3CACHE_ASSOCIATIVITY); + build_append_byte(tbl, ARM_L3CACHE_ATTRIBUTES); + build_append_int_noprefix(tbl, ARM_L3CACHE_LINE_SIZE, 2); + break; + default: + build_append_int_noprefix(tbl, 0, 4); + build_append_int_noprefix(tbl, 0, 4); + build_append_byte(tbl, 0); + build_append_byte(tbl, 0); + build_append_int_noprefix(tbl, 0, 2); + } +} + +/* + * ACPI spec, Revision 6.3 + * 5.2.29 Processor Properties Topology Table (PPTT) + */ +static void build_pptt_arm(GArray *table_data, BIOSLinker *linker, MachineState *ms, + const char *oem_id, const char *oem_table_id) +{ + MachineClass *mc = MACHINE_GET_CLASS(ms); + GQueue *list = g_queue_new(); + guint pptt_start = table_data->len; + guint parent_offset; + guint length, i; + int uid = 0; + int socket; + AcpiTable table = { .sig = "PPTT", .rev = 2, + .oem_id = oem_id, .oem_table_id = oem_table_id }; + bool unified_l1 = cpu_l1_cache_unified(0); + + acpi_table_begin(&table, table_data); + + for (socket = 0; socket < ms->smp.sockets; socket++) { + uint32_t l3_cache_offset = table_data->len - pptt_start; + build_cache_hierarchy_node(table_data, 0, ARM_L3_CACHE); + + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + /* + * Physical package - represents the boundary + * of a physical package + */ + (1 << 0), + 0, socket, &l3_cache_offset, 1); + } + + if (mc->smp_props.clusters_supported) { + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int cluster; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (cluster = 0; cluster < ms->smp.clusters; cluster++) { + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + (0 << 0), /* not a physical package */ + parent_offset, cluster, NULL, 0); + } + } + } + + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int core; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + for (core = 0; core < ms->smp.cores; core++) { + uint32_t priv_rsrc[3] = {}; + priv_rsrc[0] = table_data->len - pptt_start; /* L2 cache offset */ + build_cache_hierarchy_node(table_data, 0, ARM_L2_CACHE); + + if (unified_l1) { + priv_rsrc[1] = table_data->len - pptt_start; /* L1 cache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1_CACHE); + } else { + priv_rsrc[1] = table_data->len - pptt_start; /* L1 dcache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1D_CACHE); + priv_rsrc[2] = table_data->len - pptt_start; /* L1 icache offset */ + build_cache_hierarchy_node(table_data, priv_rsrc[0], ARM_L1I_CACHE); + } + + if (ms->smp.threads > 1) { + g_queue_push_tail(list, + GUINT_TO_POINTER(table_data->len - pptt_start)); + build_processor_hierarchy_node( + table_data, + (0 << 0), /* not a physical package */ + parent_offset, core, priv_rsrc, 3); + } else { + build_processor_hierarchy_node( + table_data, + (1 << 1) | /* ACPI Processor ID valid */ + (1 << 3), /* Node is a Leaf */ + parent_offset, uid++, priv_rsrc, 3); + } + } + } + + length = g_queue_get_length(list); + for (i = 0; i < length; i++) { + int thread; + + parent_offset = GPOINTER_TO_UINT(g_queue_pop_head(list)); + 
for (thread = 0; thread < ms->smp.threads; thread++) { + build_processor_hierarchy_node( + table_data, + (1 << 1) | /* ACPI Processor ID valid */ + (1 << 2) | /* Processor is a Thread */ + (1 << 3), /* Node is a Leaf */ + parent_offset, uid++, NULL, 0); + } + } + + g_queue_free(list); + acpi_table_end(linker, &table); +} + +static void acpi_dsdt_add_psd(Aml *dev, int cpus) +{ + Aml *pkg; + Aml *sub; + + sub = aml_package(5); + aml_append(sub, aml_int(5)); + aml_append(sub, aml_int(0)); + /* Assume all vCPUs belong to the same domain */ + aml_append(sub, aml_int(0)); + /* SW_ANY: OSPM coordinated, may be initiated on any processor */ + aml_append(sub, aml_int(0xFD)); + aml_append(sub, aml_int(cpus)); + + pkg = aml_package(1); + aml_append(pkg, sub); + + aml_append(dev, aml_name_decl("_PSD", pkg)); +} + +static void acpi_dsdt_add_cppc(Aml *dev, uint64_t cpu_base, int *regs_offset) +{ + Aml *cpc; + int i; + + /* Use version 3 of CPPC table from ACPI 6.3 */ + cpc = aml_package(23); + aml_append(cpc, aml_int(23)); + aml_append(cpc, aml_int(3)); + + for (i = 0; i < CPPC_REG_COUNT; i++) { + Aml *res; + uint8_t reg_width; + uint8_t acc_type; + uint64_t addr; + + if (regs_offset[i] == -1) { + reg_width = 0; + acc_type = AML_ANY_ACC; + addr = 0; + } else { + addr = cpu_base + regs_offset[i]; + if (i == REFERENCE_CTR || i == DELIVERED_CTR) { + reg_width = 64; + acc_type = AML_QWORD_ACC; + } else { + reg_width = 32; + acc_type = AML_DWORD_ACC; + } + } + + res = aml_resource_template(); + aml_append(res, aml_generic_register(AML_SYSTEM_MEMORY, reg_width, 0, + acc_type, addr)); + aml_append(cpc, res); + } + + aml_append(dev, aml_name_decl("_CPC", cpc)); +} + +static void virt_acpi_dsdt_cpu_cppc(int ncpu, int num_cpu, Aml *dev) +{ + VirtMachineState *vms = VIRT_MACHINE(qdev_get_machine()); + const MemMapEntry *cppc_memmap = &vms->memmap[VIRT_CPUFREQ]; + + /* + * Append _CPC and _PSD to expose CPU frequency information. + * CPPC availability is checked via the DESIRED_PERF register. + */ + if (cppc_regs_offset[DESIRED_PERF] != -1) { + acpi_dsdt_add_cppc(dev, + cppc_memmap->base + ncpu * CPPC_REG_PER_CPU_STRIDE, + cppc_regs_offset); + acpi_dsdt_add_psd(dev, num_cpu); + } +} + static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms) { MachineState *ms = MACHINE(vms); @@ -72,6 +316,9 @@ static void acpi_dsdt_add_cpus(Aml *scope, VirtMachineState *vms) Aml *dev = aml_device("C%.03X", i); aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0007"))); aml_append(dev, aml_name_decl("_UID", aml_int(i))); + + virt_acpi_dsdt_cpu_cppc(i, ms->smp.cpus, dev); + aml_append(scope, dev); } } @@ -159,6 +406,54 @@ static void acpi_dsdt_add_virtio(Aml *scope, } } +static void acpi_dsdt_add_hisi_sec(Aml *scope, + const MemMapEntry *virtio_mmio_memmap, + int dev_id) +{ + hwaddr size = 0x10000; + + /* + * Calculate the base address for the sec device node. + * Each device group contains one sec device and one hpre device, spaced by 2 * size.
+ */ + hwaddr base = virtio_mmio_memmap->base + dev_id * 2 * size; + + Aml *dev = aml_device("SE%02u", dev_id); + aml_append(dev, aml_name_decl("_HID", aml_string("SEC07"))); + aml_append(dev, aml_name_decl("_UID", aml_int(dev_id))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + Aml *crs = aml_resource_template(); + + aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} + +static void acpi_dsdt_add_hisi_hpre(Aml *scope, + const MemMapEntry *virtio_mmio_memmap, + int dev_id) +{ + hwaddr size = 0x10000; + + /* + * Calculate the base address for the hpre device node. + * Each hpre device follows the corresponding sec device by an additional offset of size. + */ + hwaddr base = virtio_mmio_memmap->base + dev_id * 2 * size + size; + + Aml *dev = aml_device("HP%02u", dev_id); + aml_append(dev, aml_name_decl("_HID", aml_string("HPRE07"))); + aml_append(dev, aml_name_decl("_UID", aml_int(dev_id))); + aml_append(dev, aml_name_decl("_CCA", aml_int(1))); + + Aml *crs = aml_resource_template(); + + aml_append(crs, aml_memory32_fixed(base, size, AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); +} + static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, uint32_t irq, VirtMachineState *vms) { @@ -171,6 +466,14 @@ static void acpi_dsdt_add_pci(Aml *scope, const MemMapEntry *memmap, .bus = vms->bus, }; + /* + * Accel SMMU requires RMRs for MSI 1-1 mapping, which + * require _DSM for PreservingPCI Boot Configurations + */ + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + cfg.preserve_config = true; + } + if (vms->highmem_mmio) { cfg.mmio64 = memmap[VIRT_HIGH_PCIE_MMIO]; } @@ -249,7 +552,7 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms) #define IORT_NODE_OFFSET 48 static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, - uint32_t id_count, uint32_t out_ref) + uint32_t id_count, uint32_t out_ref, uint32_t flags) { /* Table 4 ID mapping format */ build_append_int_noprefix(table_data, input_base, 4); /* Input base */ @@ -257,7 +560,7 @@ static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, build_append_int_noprefix(table_data, input_base, 4); /* Output base */ build_append_int_noprefix(table_data, out_ref, 4); /* Output Reference */ /* Flags */ - build_append_int_noprefix(table_data, 0 /* Single mapping (disabled) */, 4); + build_append_int_noprefix(table_data, flags, 4); /* Flags */ } struct AcpiIortIdMapping { @@ -299,6 +602,50 @@ static int iort_idmap_compare(gconstpointer a, gconstpointer b) return idmap_a->input_base - idmap_b->input_base; } +static void +build_iort_rmr_nodes(GArray *table_data, GArray *smmu_idmaps, + size_t *smmu_offset, uint32_t *id) +{ + AcpiIortIdMapping *range; + int i; + + for (i = 0; i < smmu_idmaps->len; i++) { + range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + int bdf = range->input_base; + + /* Table 18 Reserved Memory Range Node */ + + build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */ + /* Length */ + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE + 20, 2); + build_append_int_noprefix(table_data, 3, 1); /* Revision */ + build_append_int_noprefix(table_data, *id, 4); /* Identifier */ + /* Number of ID mappings */ + build_append_int_noprefix(table_data, 1, 4); + /* Reference to ID Array */ + build_append_int_noprefix(table_data, 28, 4); + + /* RMR specific data */ + + /* Flags */ + build_append_int_noprefix(table_data, 0 /* 
Disallow remapping */, 4); + /* Number of Memory Range Descriptors */ + build_append_int_noprefix(table_data, 1 , 4); + /* Reference to Memory Range Descriptors */ + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE, 4); + build_iort_id_mapping(table_data, bdf, range->id_count, smmu_offset[i], 1); + + /* Table 19 Memory Range Descriptor */ + + /* Physical Range offset */ + build_append_int_noprefix(table_data, 0x8000000, 8); + /* Physical Range length */ + build_append_int_noprefix(table_data, 0x100000, 8); + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + *id += 1; + } +} + /* * Input Output Remapping Table (IORT) * Conforms to "IO Remapping Table System Software on ARM Platforms", @@ -308,19 +655,34 @@ static void build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i, nb_nodes, rc_mapping_count; - const uint32_t iort_node_offset = IORT_NODE_OFFSET; - size_t node_size, smmu_offset = 0; + size_t node_size, *smmu_offset; AcpiIortIdMapping *idmap; + hwaddr base; + int irq, num_smmus = 0; uint32_t id = 0; GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); - AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id, + AcpiTable table = { .sig = "IORT", .rev = 5, .oem_id = vms->oem_id, .oem_table_id = vms->oem_table_id }; /* Table 2 The IORT */ acpi_table_begin(&table, table_data); - if (vms->iommu == VIRT_IOMMU_SMMUV3) { + if (vms->smmu_accel_count) { + irq = vms->irqmap[VIRT_SMMU_ACCEL] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU_ACCEL].base; + num_smmus = vms->smmu_accel_count; + } else if (virt_has_smmuv3(vms)) { + irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + base = vms->memmap[VIRT_SMMU].base; + num_smmus = 1; + } + + smmu_offset = g_new0(size_t, num_smmus); + nb_nodes = 2; /* RC, ITS */ + nb_nodes += num_smmus; /* SMMU nodes */ + + if (virt_has_smmuv3(vms)) { AcpiIortIdMapping next_range = {0}; object_child_foreach_recursive(object_get_root(), @@ -342,18 +704,19 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } next_range.input_base = idmap->input_base + idmap->id_count; + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + nb_nodes++; /* RMR node per SMMU */ + } } /* Append the last RC -> ITS ID mapping */ - if (next_range.input_base < 0xFFFF) { - next_range.id_count = 0xFFFF - next_range.input_base; + if (next_range.input_base < 0x10000) { + next_range.id_count = 0x10000 - next_range.input_base; g_array_append_val(its_idmaps, next_range); } - nb_nodes = 3; /* RC, ITS, SMMUv3 */ rc_mapping_count = smmu_idmaps->len + its_idmaps->len; } else { - nb_nodes = 2; /* RC, ITS */ rc_mapping_count = 1; } /* Number of IORT Nodes */ @@ -375,10 +738,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* GIC ITS Identifier Array */ build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4); - if (vms->iommu == VIRT_IOMMU_SMMUV3) { - int irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; + for (i = 0; i < num_smmus; i++) { + smmu_offset[i] = table_data->len - table.table_offset; - smmu_offset = table_data->len - table.table_offset; /* Table 9 SMMUv3 Format */ build_append_int_noprefix(table_data, 4 /* SMMUv3 */, 1); /* Type */ node_size = SMMU_V3_ENTRY_SIZE + ID_MAPPING_ENTRY_SIZE; @@ -389,7 +751,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) /* Reference to ID Array */ build_append_int_noprefix(table_data, SMMU_V3_ENTRY_SIZE, 4); /* Base address */ 
- build_append_int_noprefix(table_data, vms->memmap[VIRT_SMMU].base, 8); + build_append_int_noprefix(table_data, base + (i * SMMU_IO_LEN), 8); /* Flags */ build_append_int_noprefix(table_data, 1 /* COHACC Override */, 4); build_append_int_noprefix(table_data, 0, 4); /* Reserved */ @@ -400,12 +762,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, irq + 1, 4); /* PRI */ build_append_int_noprefix(table_data, irq + 3, 4); /* GERR */ build_append_int_noprefix(table_data, irq + 2, 4); /* Sync */ + irq += NUM_SMMU_IRQS; build_append_int_noprefix(table_data, 0, 4); /* Proximity domain */ /* DeviceID mapping index (ignored since interrupts are GSIV based) */ build_append_int_noprefix(table_data, 0, 4); /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); } /* Table 17 Root Complex Node */ @@ -438,7 +801,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, 0, 3); /* Reserved */ /* Output Reference */ - if (vms->iommu == VIRT_IOMMU_SMMUV3) { + if (virt_has_smmuv3(vms)) { AcpiIortIdMapping *range; /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */ @@ -446,7 +809,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); /* output IORT node is the smmuv3 node */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, smmu_offset); + range->id_count, smmu_offset[i], 0); } /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ @@ -454,11 +817,15 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) range = &g_array_index(its_idmaps, AcpiIortIdMapping, i); /* output IORT node is the ITS group node (the first node) */ build_iort_id_mapping(table_data, range->input_base, - range->id_count, iort_node_offset); + range->id_count, IORT_NODE_OFFSET, 0); } } else { /* output IORT node is the ITS group node (the first node) */ - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); + build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0); + } + + if (vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL) { + build_iort_rmr_nodes(table_data, smmu_idmaps, smmu_offset, &id); } acpi_table_end(linker, &table); @@ -471,48 +838,34 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * Rev: 1.07 */ static void -build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) +spcr_setup(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { - AcpiTable table = { .sig = "SPCR", .rev = 2, .oem_id = vms->oem_id, - .oem_table_id = vms->oem_table_id }; - - acpi_table_begin(&table, table_data); - - /* Interface Type */ - build_append_int_noprefix(table_data, 3, 1); /* ARM PL011 UART */ - build_append_int_noprefix(table_data, 0, 3); /* Reserved */ - /* Base Address */ - build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 32, 0, 3, - vms->memmap[VIRT_UART].base); - /* Interrupt Type */ - build_append_int_noprefix(table_data, - (1 << 3) /* Bit[3] ARMH GIC interrupt */, 1); - build_append_int_noprefix(table_data, 0, 1); /* IRQ */ - /* Global System Interrupt */ - build_append_int_noprefix(table_data, - vms->irqmap[VIRT_UART] + ARM_SPI_BASE, 4); - build_append_int_noprefix(table_data, 3 /* 9600 */, 1); /* Baud Rate */ - build_append_int_noprefix(table_data, 0 /* No Parity 
*/, 1); /* Parity */ - /* Stop Bits */ - build_append_int_noprefix(table_data, 1 /* 1 Stop bit */, 1); - /* Flow Control */ - build_append_int_noprefix(table_data, - (1 << 1) /* RTS/CTS hardware flow control */, 1); - /* Terminal Type */ - build_append_int_noprefix(table_data, 0 /* VT100 */, 1); - build_append_int_noprefix(table_data, 0, 1); /* Language */ - /* PCI Device ID */ - build_append_int_noprefix(table_data, 0xffff /* not a PCI device*/, 2); - /* PCI Vendor ID */ - build_append_int_noprefix(table_data, 0xffff /* not a PCI device*/, 2); - build_append_int_noprefix(table_data, 0, 1); /* PCI Bus Number */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Device Number */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Function Number */ - build_append_int_noprefix(table_data, 0, 4); /* PCI Flags */ - build_append_int_noprefix(table_data, 0, 1); /* PCI Segment */ - build_append_int_noprefix(table_data, 0, 4); /* Reserved */ + AcpiSpcrData serial = { + .interface_type = 3, /* ARM PL011 UART */ + .base_addr.id = AML_AS_SYSTEM_MEMORY, + .base_addr.width = 32, + .base_addr.offset = 0, + .base_addr.size = 3, + .base_addr.addr = vms->memmap[VIRT_UART].base, + .interrupt_type = (1 << 3), /* Bit[3] ARMH GIC interrupt */ + .pc_interrupt = 0, /* IRQ */ + .interrupt = (vms->irqmap[VIRT_UART] + ARM_SPI_BASE), + .baud_rate = 3, /* 9600 */ + .parity = 0, /* No Parity */ + .stop_bits = 1, /* 1 Stop bit */ + .flow_control = 1 << 1, /* RTS/CTS hardware flow control */ + .terminal_type = 0, /* VT100 */ + .language = 0, /* Language */ + .pci_device_id = 0xffff, /* not a PCI device */ + .pci_vendor_id = 0xffff, /* not a PCI device */ + .pci_bus = 0, + .pci_device = 0, + .pci_function = 0, + .pci_flags = 0, + .pci_segment = 0, + }; - acpi_table_end(linker, &table); + build_spcr(table_data, linker, &serial, 2, vms->oem_id, vms->oem_table_id); } /* @@ -700,14 +1053,50 @@ static void build_append_gicr(GArray *table_data, uint64_t base, uint32_t size) build_append_int_noprefix(table_data, size, 4); /* Discovery Range Length */ } +static uint32_t virt_acpi_get_gicc_flags(CPUState *cpu, VirtMachineState *vms) +{ + /* Without CPU hotplug support, CPUs can only exist in the 'enabled' state */ + if (!vms->cpu_hotplug_enabled) { + return 1; + } + + /* + * The ARM GIC CPU Interface can be 'online-capable' or 'enabled' at boot. + * We MUST set the 'online-capable' bit for all hotpluggable CPUs. + * Change Link: https://bugzilla.tianocore.org/show_bug.cgi?id=3706 + * + * UEFI ACPI Specification 6.5 + * Section: 5.2.12.14. GIC CPU Interface (GICC) Structure + * Table: 5.37 GICC CPU Interface Flags + * Link: https://uefi.org/specs/ACPI/6.5 + * + * Cold-booted CPUs, except for the first/boot CPU, SHOULD be allowed to be + * hot(un)plugged as well, but for that to happen they MUST have the + * 'online-capable' bit set. That, however, creates a compatibility problem + * with legacy OSes, which might ignore the 'online-capable' bit during boot + * and hence fail to detect some CPUs. To fix this, the MADT GIC CPU + * interface flags should be allowed to have both the 'online-capable' and + * 'Enabled' bits set together, which requires a change to the UEFI ACPI + * standard. Until that happens, expose all cold-booted CPUs as 'enabled' + * only. + */ + return cpu && cpu->cold_booted ?
1 : (1 << 3); +} + static void build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { int i; VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); + MachineState *ms = MACHINE(vms); const MemMapEntry *memmap = vms->memmap; AcpiTable table = { .sig = "APIC", .rev = 4, .oem_id = vms->oem_id, .oem_table_id = vms->oem_table_id }; + unsigned int max_cpus = ms->smp.max_cpus; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } acpi_table_begin(&table, table_data); /* Local Interrupt Controller Address */ @@ -726,12 +1115,13 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, vms->gic_version, 1); build_append_int_noprefix(table_data, 0, 3); /* Reserved */ - for (i = 0; i < MACHINE(vms)->smp.cpus; i++) { - ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(i)); + for (i = 0; i < max_cpus; i++) { + CPUState *cpu = qemu_get_possible_cpu(i); uint64_t physical_base_address = 0, gich = 0, gicv = 0; uint32_t vgic_interrupt = vms->virt ? ARCH_GIC_MAINT_IRQ : 0; - uint32_t pmu_interrupt = arm_feature(&armcpu->env, ARM_FEATURE_PMU) ? - VIRTUAL_PMU_IRQ : 0; + uint32_t pmu_interrupt = vms->pmu ? VIRTUAL_PMU_IRQ : 0; + uint32_t flags = virt_acpi_get_gicc_flags(cpu, vms); + uint64_t mpidr = qemu_get_cpu_archid(i); if (vms->gic_version == VIRT_GIC_VERSION_2) { physical_base_address = memmap[VIRT_GIC_CPU].base; @@ -746,7 +1136,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, i, 4); /* GIC ID */ build_append_int_noprefix(table_data, i, 4); /* ACPI Processor UID */ /* Flags */ - build_append_int_noprefix(table_data, 1, 4); /* Enabled */ + build_append_int_noprefix(table_data, flags, 4); /* Parking Protocol Version */ build_append_int_noprefix(table_data, 0, 4); /* Performance Interrupt GSIV */ @@ -760,7 +1150,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) build_append_int_noprefix(table_data, vgic_interrupt, 4); build_append_int_noprefix(table_data, 0, 8); /* GICR Base Address */ /* MPIDR */ - build_append_int_noprefix(table_data, armcpu->mp_affinity, 8); + build_append_int_noprefix(table_data, mpidr, 8); /* Processor Power Efficiency Class */ build_append_int_noprefix(table_data, 0, 1); /* Reserved */ @@ -837,6 +1227,55 @@ static void build_fadt_rev6(GArray *table_data, BIOSLinker *linker, build_fadt(table_data, linker, &fadt, vms->oem_id, vms->oem_table_id); } +static void build_virt_osc_method(Aml *scope, VirtMachineState *vms) +{ + Aml *if_uuid, *else_uuid, *if_rev, *if_caps_masked, *method; + Aml *a_cdw1 = aml_name("CDW1"); + Aml *a_cdw2 = aml_local(0); + + method = aml_method("_OSC", 4, AML_NOTSERIALIZED); + aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1")); + + /* match UUID */ + if_uuid = aml_if(aml_equal( + aml_arg(0), aml_touuid("0811B06E-4A27-44F9-8D60-3CBBC22E7B48"))); + + aml_append(if_uuid, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2")); + aml_append(if_uuid, aml_store(aml_name("CDW2"), a_cdw2)); + + /* check unknown revision in arg(1) */ + if_rev = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(1)))); + /* set revision error bits, DWORD1 Bit[3] */ + aml_append(if_rev, aml_or(a_cdw1, aml_int(0x08), a_cdw1)); + aml_append(if_uuid, if_rev); + + /* + * check support for the vCPU hotplug type(=enabled) platform-wide capability + * in DWORD2 as specified in the ACPI Specification ECR below: + * # https://bugzilla.tianocore.org/show_bug.cgi?id=4481 + */ + if (vms->acpi_dev) { +
aml_append(if_uuid, aml_and(a_cdw2, aml_int(0x800000), a_cdw2)); + /* check if OSPM specified hotplug capability bits were masked */ + if_caps_masked = aml_if(aml_lnot(aml_equal(aml_name("CDW2"), a_cdw2))); + aml_append(if_caps_masked, aml_or(a_cdw1, aml_int(0x10), a_cdw1)); + aml_append(if_uuid, if_caps_masked); + } + aml_append(if_uuid, aml_store(a_cdw2, aml_name("CDW2"))); + + aml_append(method, if_uuid); + else_uuid = aml_else(); + + /* set unrecognized UUID error bits, DWORD1 Bit[2] */ + aml_append(else_uuid, aml_or(a_cdw1, aml_int(4), a_cdw1)); + aml_append(method, else_uuid); + + aml_append(method, aml_return(aml_arg(3))); + aml_append(scope, method); + + return; +} + /* DSDT */ static void build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) @@ -858,7 +1297,22 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) * the RTC ACPI device at all when using UEFI. */ scope = aml_scope("\\_SB"); - acpi_dsdt_add_cpus(scope, vms); + + if (vms->cpu_hotplug_enabled) { + CPUHotplugFeatures opts = { + .acpi_1_compatible = false, + .has_legacy_cphp = false + }; + + build_cpus_aml(scope, ms, opts, NULL, virt_acpi_dsdt_cpu_cppc, + memmap[VIRT_CPUHP_ACPI].base, + "\\_SB", NULL, AML_SYSTEM_MEMORY); + } else { + acpi_dsdt_add_cpus(scope, vms); + } + + build_virt_osc_method(scope, vms); + acpi_dsdt_add_uart(scope, &memmap[VIRT_UART], (irqmap[VIRT_UART] + ARM_SPI_BASE)); if (vmc->acpi_expose_flash) { @@ -868,6 +1322,15 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) acpi_dsdt_add_virtio(scope, &memmap[VIRT_MMIO], (irqmap[VIRT_MMIO] + ARM_SPI_BASE), NUM_VIRTIO_TRANSPORTS); acpi_dsdt_add_pci(scope, memmap, irqmap[VIRT_PCIE] + ARM_SPI_BASE, vms); + + if (virtcca_cvm_enabled()) { + int kae_num = tmm_get_kae_num(); + for (int i = 0; i < kae_num; i++) { + acpi_dsdt_add_hisi_sec(scope, &memmap[VIRT_KAE_DEVICE], i); + acpi_dsdt_add_hisi_hpre(scope, &memmap[VIRT_KAE_DEVICE], i); + } + } + if (vms->acpi_dev) { build_ged_aml(scope, "\\_SB."GED_DEVICE, HOTPLUG_HANDLER(vms->acpi_dev), @@ -951,7 +1414,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) if (!vmc->no_cpu_topology) { acpi_add_table(table_offsets, tables_blob); - build_pptt(tables_blob, tables->linker, ms, + build_pptt_arm(tables_blob, tables->linker, ms, vms->oem_id, vms->oem_table_id); } @@ -969,7 +1432,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) } acpi_add_table(table_offsets, tables_blob); - build_spcr(tables_blob, tables->linker, vms); + spcr_setup(tables_blob, tables->linker, vms); acpi_add_table(table_offsets, tables_blob); build_dbg2(tables_blob, tables->linker, vms); diff --git a/hw/arm/virt.c b/hw/arm/virt.c index be2856c018aa14c6ffda283cd7f99fb0d8e7a803..2957b844d28e5307cbb6be8cde001d08c3ffbeae 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -32,6 +32,7 @@ #include "qemu/datadir.h" #include "qemu/units.h" #include "qemu/option.h" +#include "qemu/log.h" #include "monitor/qdev.h" #include "hw/sysbus.h" #include "hw/arm/boot.h" @@ -45,6 +46,8 @@ #include "sysemu/device_tree.h" #include "sysemu/numa.h" #include "sysemu/runstate.h" +#include "sysemu/reset.h" +#include "sysemu/sysemu.h" #include "sysemu/tpm.h" #include "sysemu/tcg.h" #include "sysemu/kvm.h" @@ -77,10 +80,12 @@ #include "hw/mem/pc-dimm.h" #include "hw/mem/nvdimm.h" #include "hw/acpi/generic_event_device.h" +#include "hw/acpi/cpu_hotplug.h" #include "hw/virtio/virtio-md-pci.h" #include "hw/virtio/virtio-iommu.h" #include "hw/char/pl011.h" #include 
"qemu/guest-random.h" +#include "qapi/qmp/qdict.h" #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ @@ -150,13 +155,18 @@ static const MemMapEntry base_memmap[] = { [VIRT_FW_CFG] = { 0x09020000, 0x00000018 }, [VIRT_GPIO] = { 0x09030000, 0x00001000 }, [VIRT_SECURE_UART] = { 0x09040000, 0x00001000 }, - [VIRT_SMMU] = { 0x09050000, 0x00020000 }, + [VIRT_SMMU] = { 0x09050000, SMMU_IO_LEN }, [VIRT_PCDIMM_ACPI] = { 0x09070000, MEMORY_HOTPLUG_IO_LEN }, [VIRT_ACPI_GED] = { 0x09080000, ACPI_GED_EVT_SEL_LEN }, [VIRT_NVDIMM_ACPI] = { 0x09090000, NVDIMM_ACPI_IO_LEN}, [VIRT_PVTIME] = { 0x090a0000, 0x00010000 }, [VIRT_SECURE_GPIO] = { 0x090b0000, 0x00001000 }, + [VIRT_CPUHP_ACPI] = { 0x090c0000, ACPI_CPU_HOTPLUG_REG_LEN}, [VIRT_MMIO] = { 0x0a000000, 0x00000200 }, + /* In the virtCCA scenario, this space is used for MSI interrupt mapping */ + [VIRT_CVM_MSI] = { 0x0a001000, 0x00fff000 }, + [VIRT_CPUFREQ] = { 0x0b000000, 0x00010000 }, + [VIRT_SMMU_ACCEL] = { 0x0b010000, 0x00ff0000}, /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */ [VIRT_PLATFORM_BUS] = { 0x0c000000, 0x02000000 }, [VIRT_SECURE_MEM] = { 0x0e000000, 0x01000000 }, @@ -167,6 +177,12 @@ static const MemMapEntry base_memmap[] = { [VIRT_MEM] = { GiB, LEGACY_RAMLIMIT_BYTES }, }; +void virtcca_kvm_get_mmio_addr(hwaddr *mmio_start, hwaddr *mmio_size) +{ + *mmio_start = base_memmap[VIRT_PCIE_MMIO].base; + *mmio_size = base_memmap[VIRT_PCIE_MMIO].size; +} + /* * Highmem IO Regions: This memory map is floating, located after the RAM. * Each MemMapEntry base (GPA) will be dynamically computed, depending on the @@ -202,6 +218,7 @@ static const int a15irqmap[] = { [VIRT_GIC_V2M] = 48, /* ...to 48 + NUM_GICV2M_SPIS - 1 */ [VIRT_SMMU] = 74, /* ...to 74 + NUM_SMMU_IRQS - 1 */ [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ + [VIRT_SMMU_ACCEL] = 200, }; static const char *valid_cpus[] = { @@ -220,10 +237,19 @@ static const char *valid_cpus[] = { #endif ARM_CPU_TYPE_NAME("cortex-a53"), ARM_CPU_TYPE_NAME("cortex-a57"), + ARM_CPU_TYPE_NAME("Kunpeng-920"), + ARM_CPU_TYPE_NAME("FT-2000+"), + ARM_CPU_TYPE_NAME("Tengyun-S2500"), ARM_CPU_TYPE_NAME("host"), ARM_CPU_TYPE_NAME("max"), }; +static CPUArchId *virt_find_cpu_slot(MachineState *ms, int vcpuid); +static int virt_get_socket_id(const MachineState *ms, int cpu_index); +static int virt_get_cluster_id(const MachineState *ms, int cpu_index); +static int virt_get_core_id(const MachineState *ms, int cpu_index); +static int virt_get_thread_id(const MachineState *ms, int cpu_index); + static bool cpu_type_valid(const char *cpu) { int i; @@ -271,8 +297,16 @@ static void create_fdt(VirtMachineState *vms) /* /chosen must exist for load_dtb to fill in necessary properties later */ qemu_fdt_add_subnode(fdt, "/chosen"); + + g_autofree char *kvm_type = NULL; + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + } if (vms->dtb_randomness) { - create_randomness(ms, "/chosen"); + if (!(kvm_type && !strcmp(kvm_type, "cvm"))) { + create_randomness(ms, "/chosen"); + } } if (vms->secure) { @@ -378,6 +412,122 @@ static void fdt_add_timer_nodes(const VirtMachineState *vms) INTID_TO_PPI(ARCH_TIMER_NS_EL2_IRQ), irqflags); } +/* + * In CLIDR_EL1 exposed to guest by the hypervisor, L1 cache type + * maybe unified or seperate ins and data. We need to read the + * guest visable CLIDR_EL1 and check L1 cache type. 
+ */ +bool cpu_l1_cache_unified(int cpu) +{ + bool unified = false; + uint64_t clidr; + ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); + CPUState *cs = CPU(armcpu); + int ret; + + if (kvm_enabled()) { + struct kvm_one_reg reg = { + .id = ARM64_REG_CLIDR_EL1, + .addr = (uintptr_t)&clidr + }; + + ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); + if (ret) { + error_setg(&error_fatal, "Get vCPU clidr from KVM failed:%d", ret); + return unified; + } + + if (CLIDR_CTYPE(clidr, 1) == CTYPE_UNIFIED) { + unified = true; + } + } + + return unified; +} + +static void fdt_add_l3cache_nodes(const VirtMachineState *vms) +{ + int i; + const MachineState *ms = MACHINE(vms); + int cpus_per_socket = ms->smp.clusters * ms->smp.cores * ms->smp.threads; + int sockets = (ms->smp.cpus + cpus_per_socket - 1) / cpus_per_socket; + + for (i = 0; i < sockets; i++) { + char *nodename = g_strdup_printf("/cpus/l3-cache%d", i); + + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cache"); + qemu_fdt_setprop_string(ms->fdt, nodename, "cache-unified", "true"); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-level", 3); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L3CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L3CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L3CACHE_SETS); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", + qemu_fdt_alloc_phandle(ms->fdt)); + g_free(nodename); + } +} + +static void fdt_add_l2cache_nodes(const VirtMachineState *vms) +{ + const MachineState *ms = MACHINE(vms); + int cpus_per_socket = ms->smp.clusters * ms->smp.cores * ms->smp.threads; + int cpu; + + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + char *next_path = g_strdup_printf("/cpus/l3-cache%d", + cpu / cpus_per_socket); + char *nodename = g_strdup_printf("/cpus/l2-cache%d", cpu); + + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "cache-unified", "true"); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cache"); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L2CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L2CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L2CACHE_SETS); + qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", + next_path); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", + qemu_fdt_alloc_phandle(ms->fdt)); + + g_free(next_path); + g_free(nodename); + } +} + +static void fdt_add_l1cache_prop(const VirtMachineState *vms, + char *nodename, int cpu) +{ + const MachineState *ms = MACHINE(vms); + char *next_path = g_strdup_printf("/cpus/l2-cache%d", cpu); + bool unified_l1 = cpu_l1_cache_unified(0); + + if (unified_l1) { + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-size", ARM_L1CACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-line-size", + ARM_L1CACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "cache-sets", ARM_L1CACHE_SETS); + } else { + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-size", + ARM_L1DCACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-line-size", + ARM_L1DCACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "d-cache-sets", + ARM_L1DCACHE_SETS); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-size", + ARM_L1ICACHE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-line-size", + ARM_L1ICACHE_LINE_SIZE); + qemu_fdt_setprop_cell(ms->fdt, nodename, "i-cache-sets", + 
ARM_L1ICACHE_SETS); + } + qemu_fdt_setprop_phandle(ms->fdt, nodename, "next-level-cache", next_path); + + g_free(next_path); +} + static void fdt_add_cpu_nodes(const VirtMachineState *vms) { int cpu; @@ -412,6 +562,11 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) qemu_fdt_setprop_cell(ms->fdt, "/cpus", "#address-cells", addr_cells); qemu_fdt_setprop_cell(ms->fdt, "/cpus", "#size-cells", 0x0); + if (!vmc->no_cpu_topology) { + fdt_add_l3cache_nodes(vms); + fdt_add_l2cache_nodes(vms); + } + for (cpu = smp_cpus - 1; cpu >= 0; cpu--) { char *nodename = g_strdup_printf("/cpus/cpu@%d", cpu); ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); @@ -441,6 +596,7 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) } if (!vmc->no_cpu_topology) { + fdt_add_l1cache_prop(vms, nodename, cpu); qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", qemu_fdt_alloc_phandle(ms->fdt)); } @@ -644,7 +800,7 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) DeviceState *dev; MachineState *ms = MACHINE(vms); int irq = vms->irqmap[VIRT_ACPI_GED]; - uint32_t event = ACPI_GED_PWR_DOWN_EVT; + uint32_t event = ACPI_GED_PWR_DOWN_EVT | ACPI_GED_CPU_HOTPLUG_EVT; if (ms->ram_slots) { event |= ACPI_GED_MEM_HOTPLUG_EVT; @@ -660,11 +816,22 @@ static inline DeviceState *create_acpi_ged(VirtMachineState *vms) sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, vms->memmap[VIRT_ACPI_GED].base); sysbus_mmio_map(SYS_BUS_DEVICE(dev), 1, vms->memmap[VIRT_PCDIMM_ACPI].base); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 3, vms->memmap[VIRT_CPUHP_ACPI].base); sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, qdev_get_gpio_in(vms->gic, irq)); return dev; } +static void virt_add_gic_cpuhp_notifier(VirtMachineState *vms) +{ + MachineClass *mc = MACHINE_GET_CLASS(vms); + + if (mc->has_hotpluggable_cpus && vms->gic_version >= VIRT_GIC_VERSION_3) { + Notifier *cpuhp_notifier = gicv3_cpuhp_notifier(vms->gic); + notifier_list_add(&vms->cpuhp_notifiers, cpuhp_notifier); + } +} + static void create_its(VirtMachineState *vms) { const char *itsclass = its_class_name(); @@ -713,6 +880,107 @@ static void create_v2m(VirtMachineState *vms) vms->msi_controller = VIRT_MSI_CTRL_GICV2M; } +/* + * Mapping from the output timer irq lines from the CPU to the GIC PPI inputs + * we use for the virt board. + */ +const int timer_irq[] = { + [GTIMER_PHYS] = ARCH_TIMER_NS_EL1_IRQ, + [GTIMER_VIRT] = ARCH_TIMER_VIRT_IRQ, + [GTIMER_HYP] = ARCH_TIMER_NS_EL2_IRQ, + [GTIMER_SEC] = ARCH_TIMER_S_EL1_IRQ, +}; + +static void unwire_gic_cpu_irqs(VirtMachineState *vms, CPUState *cs) +{ + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + DeviceState *cpudev = DEVICE(cs); + DeviceState *gicdev = vms->gic; + int cpu = CPU(cs)->cpu_index; + int type = vms->gic_version; + int irq; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + + for (irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { + qdev_disconnect_gpio_out_named(cpudev, NULL, irq); + } + + if (type != VIRT_GIC_VERSION_2) { + qdev_disconnect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", + 0); + } else if (vms->virt) { + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 4 * max_cpus); + } + + /* + * RFC: Question: This currently does not take care of notifying the + * devices which might be sitting on the system bus. Do we need a + * sysbus_disconnect_irq() which also does the job of notification besides + * disconnection?
+ */ + qdev_disconnect_gpio_out_named(cpudev, "pmu-interrupt", 0); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, cpu); + qdev_disconnect_gpio_out_named(gicdev, + SYSBUS_DEVICE_GPIO_IRQ, cpu + max_cpus); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 2 * max_cpus); + qdev_disconnect_gpio_out_named(gicdev, SYSBUS_DEVICE_GPIO_IRQ, + cpu + 3 * max_cpus); +} + +static void wire_gic_cpu_irqs(VirtMachineState *vms, CPUState *cs) +{ + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + DeviceState *cpudev = DEVICE(cs); + DeviceState *gicdev = vms->gic; + int cpu = CPU(cs)->cpu_index; + int type = vms->gic_version; + SysBusDevice *gicbusdev; + int intidbase; + int irq; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + + intidbase = NUM_IRQS + cpu * GIC_INTERNAL; + + for (irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { + qdev_connect_gpio_out(cpudev, irq, + qdev_get_gpio_in(gicdev, + intidbase + timer_irq[irq])); + } + + gicbusdev = SYS_BUS_DEVICE(gicdev); + if (type != VIRT_GIC_VERSION_2) { + qemu_irq qirq = qdev_get_gpio_in(gicdev, + intidbase + ARCH_GIC_MAINT_IRQ); + qdev_connect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", + 0, qirq); + } else if (vms->virt) { + qemu_irq qirq = qdev_get_gpio_in(gicdev, + intidbase + ARCH_GIC_MAINT_IRQ); + sysbus_connect_irq(gicbusdev, cpu + 4 * max_cpus, qirq); + } + + qdev_connect_gpio_out_named(cpudev, "pmu-interrupt", 0, + qdev_get_gpio_in(gicdev, + intidbase + VIRTUAL_PMU_IRQ)); + sysbus_connect_irq(gicbusdev, cpu, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ)); + sysbus_connect_irq(gicbusdev, cpu + max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_FIQ)); + sysbus_connect_irq(gicbusdev, cpu + 2 * max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ)); + sysbus_connect_irq(gicbusdev, cpu + 3 * max_cpus, + qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ)); +} + static void create_gic(VirtMachineState *vms, MemoryRegion *mem) { MachineState *ms = MACHINE(vms); @@ -721,9 +989,14 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) const char *gictype; int i; unsigned int smp_cpus = ms->smp.cpus; + unsigned int max_cpus = ms->smp.max_cpus; uint32_t nb_redist_regions = 0; int revision; + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } + if (vms->gic_version == VIRT_GIC_VERSION_2) { gictype = gic_class_name(); } else { @@ -745,7 +1018,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) } vms->gic = qdev_new(gictype); qdev_prop_set_uint32(vms->gic, "revision", revision); - qdev_prop_set_uint32(vms->gic, "num-cpu", smp_cpus); + qdev_prop_set_uint32(vms->gic, "num-cpu", max_cpus); /* Note that the num-irq property counts both internal and external * interrupts; there are always 32 of the former (mandated by GIC spec). 
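 * For a worked example (this board's usual NUM_IRQS of 256 is assumed):
 * the GIC device is then created with num-irq = 256 + 32 = 288, and
 * wire_gic_cpu_irqs() above maps each per-CPU PPI at
 * intidbase + timer_irq[irq], i.e. inside CPU n's 32-interrupt internal
 * window starting at NUM_IRQS + n * GIC_INTERNAL.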
*/ @@ -757,7 +1030,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) if (vms->gic_version != VIRT_GIC_VERSION_2) { QList *redist_region_count; uint32_t redist0_capacity = virt_redist_capacity(vms, VIRT_GIC_REDIST); - uint32_t redist0_count = MIN(smp_cpus, redist0_capacity); + uint32_t redist0_count = MIN(max_cpus, redist0_capacity); nb_redist_regions = virt_gicv3_redist_region_count(vms); @@ -768,7 +1041,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) virt_redist_capacity(vms, VIRT_HIGH_GIC_REDIST2); qlist_append_int(redist_region_count, - MIN(smp_cpus - redist0_count, redist1_capacity)); + MIN(max_cpus - redist0_count, redist1_capacity)); } qdev_prop_set_array(vms->gic, "redist-region-count", redist_region_count); @@ -808,46 +1081,7 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) * and the GIC's IRQ/FIQ/VIRQ/VFIQ interrupt outputs to the CPU's inputs. */ for (i = 0; i < smp_cpus; i++) { - DeviceState *cpudev = DEVICE(qemu_get_cpu(i)); - int intidbase = NUM_IRQS + i * GIC_INTERNAL; - /* Mapping from the output timer irq lines from the CPU to the - * GIC PPI inputs we use for the virt board. - */ - const int timer_irq[] = { - [GTIMER_PHYS] = ARCH_TIMER_NS_EL1_IRQ, - [GTIMER_VIRT] = ARCH_TIMER_VIRT_IRQ, - [GTIMER_HYP] = ARCH_TIMER_NS_EL2_IRQ, - [GTIMER_SEC] = ARCH_TIMER_S_EL1_IRQ, - }; - - for (unsigned irq = 0; irq < ARRAY_SIZE(timer_irq); irq++) { - qdev_connect_gpio_out(cpudev, irq, - qdev_get_gpio_in(vms->gic, - intidbase + timer_irq[irq])); - } - - if (vms->gic_version != VIRT_GIC_VERSION_2) { - qemu_irq irq = qdev_get_gpio_in(vms->gic, - intidbase + ARCH_GIC_MAINT_IRQ); - qdev_connect_gpio_out_named(cpudev, "gicv3-maintenance-interrupt", - 0, irq); - } else if (vms->virt) { - qemu_irq irq = qdev_get_gpio_in(vms->gic, - intidbase + ARCH_GIC_MAINT_IRQ); - sysbus_connect_irq(gicbusdev, i + 4 * smp_cpus, irq); - } - - qdev_connect_gpio_out_named(cpudev, "pmu-interrupt", 0, - qdev_get_gpio_in(vms->gic, intidbase - + VIRTUAL_PMU_IRQ)); - - sysbus_connect_irq(gicbusdev, i, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ)); - sysbus_connect_irq(gicbusdev, i + smp_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_FIQ)); - sysbus_connect_irq(gicbusdev, i + 2 * smp_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ)); - sysbus_connect_irq(gicbusdev, i + 3 * smp_cpus, - qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ)); + wire_gic_cpu_irqs(vms, qemu_get_cpu(i)); } fdt_add_gic_node(vms); @@ -857,6 +1091,9 @@ static void create_gic(VirtMachineState *vms, MemoryRegion *mem) } else if (vms->gic_version == VIRT_GIC_VERSION_2) { create_v2m(vms); } + + /* add GIC CPU hot(un)plug update notifier */ + virt_add_gic_cpuhp_notifier(vms); } static void create_uart(const VirtMachineState *vms, int uart, @@ -907,6 +1144,16 @@ static void create_uart(const VirtMachineState *vms, int uart, g_free(nodename); } +static void create_cpufreq(const VirtMachineState *vms, MemoryRegion *mem) +{ + hwaddr base = vms->memmap[VIRT_CPUFREQ].base; + DeviceState *dev = qdev_new("cpufreq"); + SysBusDevice *s = SYS_BUS_DEVICE(dev); + + sysbus_realize_and_unref(s, &error_fatal); + memory_region_add_subregion(mem, base, sysbus_mmio_get_region(s, 0)); +} + static void create_rtc(const VirtMachineState *vms) { char *nodename; @@ -936,6 +1183,7 @@ static void virt_powerdown_req(Notifier *n, void *opaque) { VirtMachineState *s = container_of(n, VirtMachineState, powerdown_notifier); + qemu_log("send powerdown to vm.\n"); if (s->acpi_dev) { acpi_send_event(s->acpi_dev, ACPI_POWER_DOWN_STATUS); } else { @@ 
-1160,6 +1408,9 @@ static void virt_flash_map1(PFlashCFI01 *flash, qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); + if (virtcca_cvm_enabled()) { + return; + } memory_region_add_subregion(sysmem, base, sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0)); @@ -1195,6 +1446,10 @@ static void virt_flash_fdt(VirtMachineState *vms, MachineState *ms = MACHINE(vms); char *nodename; + if (virtcca_cvm_enabled()) { + return; + } + if (sysmem == secure_sysmem) { /* Report both flash devices as a single node in the DT */ nodename = g_strdup_printf("/flash@%" PRIx64, flashbase); @@ -1230,6 +1485,23 @@ static void virt_flash_fdt(VirtMachineState *vms, } } +static bool virt_confidential_firmware_init(VirtMachineState *vms, + MemoryRegion *sysmem) +{ + MemoryRegion *fw_ram; + hwaddr fw_base = vms->memmap[VIRT_FLASH].base; + hwaddr fw_size = vms->memmap[VIRT_FLASH].size; + + if (!MACHINE(vms)->firmware) { + return false; + } + + fw_ram = g_new(MemoryRegion, 1); + memory_region_init_ram(fw_ram, NULL, "fw_ram", fw_size, NULL); + memory_region_add_subregion(sysmem, fw_base, fw_ram); + return true; +} + static bool virt_firmware_init(VirtMachineState *vms, MemoryRegion *sysmem, MemoryRegion *secure_sysmem) @@ -1248,6 +1520,10 @@ static bool virt_firmware_init(VirtMachineState *vms, pflash_blk0 = pflash_cfi01_get_blk(vms->flash[0]); + if (virtcca_cvm_enabled()) { + return virt_confidential_firmware_init(vms, sysmem); + } + bios_name = MACHINE(vms)->firmware; if (bios_name) { char *fname; @@ -1289,7 +1565,7 @@ static FWCfgState *create_fw_cfg(const VirtMachineState *vms, AddressSpace *as) char *nodename; fw_cfg = fw_cfg_init_mem_wide(base + 8, base, 8, base + 16, as); - fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, (uint16_t)ms->smp.cpus); + fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); @@ -1775,6 +2051,33 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits) vms->memmap[i] = base_memmap[i]; } + /* fix VIRT_MEM range */ + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + + if (!strcmp(kvm_type, "cvm")) { + /* support kae vf device tree nodes */ + vms->memmap[VIRT_PCIE_MMIO] = (MemMapEntry) { 0x10000000, 0x2edf0000 }; + vms->memmap[VIRT_KAE_DEVICE] = (MemMapEntry) { 0x3edf0000, 0x00200000 }; + uint64_t tmi_version = 0; + int ret = -1; + if (kvm_enabled()) { + ret = kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version); + } + if (ret < 0) { + warn_report("cannot get TMI version"); + } + if (tmi_version < MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM) { + vms->memmap[VIRT_MEM].base = 3 * GiB; + } + virtcca_cvm_gpa_start = vms->memmap[VIRT_MEM].base; + vms->memmap[VIRT_MEM].size = ms->ram_size; + info_report("[qemu] fix VIRT_MEM range 0x%llx - 0x%llx", (unsigned long long)(vms->memmap[VIRT_MEM].base), + (unsigned long long)(vms->memmap[VIRT_MEM].base + ms->ram_size)); + } + } + if (ms->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported number of memory slots: %"PRIu64, ms->ram_slots); @@ -1962,12 +2265,15 @@ static void finalize_gic_version(VirtMachineState *vms) */ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) { + CPUArchIdList *possible_cpus = vms->parent.possible_cpus; int max_cpus = MACHINE(vms)->smp.max_cpus; - bool aarch64, pmu, steal_time; + MachineState *ms = 
MACHINE(vms); + bool aarch64, steal_time; CPUState *cpu; + int n; aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL); - pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL); + vms->pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL); steal_time = object_property_get_bool(OBJECT(first_cpu), "kvm-steal-time", NULL); @@ -1994,8 +2300,13 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) memory_region_add_subregion(sysmem, pvtime_reg_base, pvtime); } - CPU_FOREACH(cpu) { - if (pmu) { + for (n = 0; n < possible_cpus->len; n++) { + cpu = qemu_get_possible_cpu(n); + if (!qemu_present_cpu(cpu)) { + continue; + } + + if (vms->pmu) { assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU)); if (kvm_irqchip_in_kernel()) { kvm_arm_pmu_set_irq(cpu, VIRTUAL_PMU_IRQ); @@ -2020,6 +2331,207 @@ static void virt_cpu_post_init(VirtMachineState *vms, MemoryRegion *sysmem) } } } + + if (kvm_enabled() || tcg_enabled()) { + for (n = 0; n < possible_cpus->len; n++) { + cpu = qemu_get_possible_cpu(n); + if (!qemu_present_cpu(cpu)) { + continue; + } + + /* + * Now, GIC has been sized with possible CPUs and we don't require + * disabled vCPU objects to be represented in the QOM. Release the + * disabled ARMCPU objects earlier used during init for pre-sizing. + * + * We fake to the guest through ACPI the presence (_STA.PRES=1) + * of these non-existent vCPUs at VMM/qemu and present these as + * disabled vCPUs (_STA.ENA=0) so that they can't be used. These vCPUs + * can be later added to the guest through hotplug exchanges when + * ARMCPU objects are created back again using the 'device_add' QMP + * command. + */ + /* + * RFC: Question: Another approach could have been to keep them forever + * and release them only once when qemu exits as part of finalize, or + * when a new vCPU is hotplugged. In the latter case, the old object + * could be released when a new one is created for the same vCPU? 
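+ *
+ * For illustration only (the CPU type and IDs here are hypothetical): a
+ * vCPU released below would later be re-created through QMP along the
+ * lines of
+ *   device_add host-arm-cpu,id=cpu12,socket-id=1,cluster-id=0,core-id=2,thread-id=0
+ * which builds a fresh ARMCPU object and enters virt_cpu_pre_plug().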
+ */ + if (!qemu_enabled_cpu(cpu)) { + CPUArchId *cpu_slot; + cpu_slot = virt_find_cpu_slot(ms, cpu->cpu_index); + cpu_slot->cpu = NULL; + object_unref(OBJECT(cpu)); + } + } + } +} + +static void virt_cpu_set_properties(Object *cpuobj, const CPUArchId *cpu_slot, + Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + VirtMachineState *vms = VIRT_MACHINE(ms); + Error *local_err = NULL; + VirtMachineClass *vmc; + + vmc = VIRT_MACHINE_GET_CLASS(ms); + + /* now, set the cpu object property values */ + numa_cpu_pre_plug(cpu_slot, DEVICE(cpuobj), &local_err); + if (local_err) { + goto out; + } + + object_property_set_int(cpuobj, "mp-affinity", cpu_slot->arch_id, NULL); + + if (!vms->secure) { + object_property_set_bool(cpuobj, "has_el3", false, NULL); + } + + if (!vms->virt && object_property_find(cpuobj, "has_el2")) { + object_property_set_bool(cpuobj, "has_el2", false, NULL); + } + + if (vmc->kvm_no_adjvtime && + object_property_find(cpuobj, "kvm-no-adjvtime")) { + object_property_set_bool(cpuobj, "kvm-no-adjvtime", true, NULL); + } + + if (vmc->no_kvm_steal_time && + object_property_find(cpuobj, "kvm-steal-time")) { + object_property_set_bool(cpuobj, "kvm-steal-time", false, NULL); + } + + if (vmc->no_pmu && object_property_find(cpuobj, "pmu")) { + object_property_set_bool(cpuobj, "pmu", false, NULL); + } + + if (vmc->no_tcg_lpa2 && object_property_find(cpuobj, "lpa2")) { + object_property_set_bool(cpuobj, "lpa2", false, NULL); + } + + if (object_property_find(cpuobj, "reset-cbar")) { + object_property_set_int(cpuobj, "reset-cbar", + vms->memmap[VIRT_CPUPERIPHS].base, + &local_err); + if (local_err) { + goto out; + } + } + + /* link already initialized {secure,tag}-memory regions to this cpu */ + object_property_set_link(cpuobj, "memory", OBJECT(vms->sysmem), &local_err); + if (local_err) { + goto out; + } + + if (vms->secure) { + object_property_set_link(cpuobj, "secure-memory", + OBJECT(vms->secure_sysmem), &local_err); + if (local_err) { + goto out; + } + } + + if (vms->mte) { + if (!object_property_find(cpuobj, "tag-memory")) { + error_setg(&local_err, "MTE requested, but not supported " + "by the guest CPU"); + if (local_err) { + goto out; + } + } + + object_property_set_link(cpuobj, "tag-memory", OBJECT(vms->tag_sysmem), + &local_err); + if (local_err) { + goto out; + } + + if (vms->secure) { + object_property_set_link(cpuobj, "secure-tag-memory", + OBJECT(vms->secure_tag_sysmem), + &local_err); + if (local_err) { + goto out; + } + } + } + + /* + * RFC: Question: this must only be called for the hotplugged cpus. For the + * cold-booted secondary cpus this is taken care of in arm_load_kernel() + * in boot.c. Perhaps we should remove that code now? + */ + if (vms->psci_conduit != QEMU_PSCI_CONDUIT_DISABLED) { + object_property_set_int(cpuobj, "psci-conduit", vms->psci_conduit, + &local_err); + if (local_err) { + goto out; + } + + /* Secondary CPUs start in PSCI powered-down state */ + if (CPU(cpuobj)->cpu_index > 0) { + object_property_set_bool(cpuobj, "start-powered-off", true, NULL); + } + } + +out: + if (local_err) { + error_propagate(errp, local_err); + } + return; +} + +static void fdt_add_hisi_sec_nodes(const VirtMachineState *vms, int dev_id) +{ + const MachineState *ms = MACHINE(vms); + hwaddr size = 0x10000; + + /* + * Calculate the base address for the sec device node. + * Each device group contains one sec device and one hpre device, spaced by 2 * size. + */
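+ /*
+  * A worked example under the cvm memmap values set in virt_set_memmap()
+  * above (VIRT_KAE_DEVICE.base = 0x3edf0000, size = 0x10000): dev_id 0
+  * places sec at 0x3edf0000 and hpre at 0x3ee00000; dev_id 1 places sec
+  * at 0x3ee10000 and hpre at 0x3ee20000.
+  */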
+ */ + hwaddr base = vms->memmap[VIRT_KAE_DEVICE].base + dev_id * 2 * size; + char *nodename; + + tmm_set_sec_addr(base, dev_id); + + nodename = g_strdup_printf("/hisi-sec@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "hisilicon,hip07-sec-vf"); + qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + g_free(nodename); +} + +static void fdt_add_hisi_hpre_nodes(const VirtMachineState *vms, int dev_id) +{ + const MachineState *ms = MACHINE(vms); + hwaddr size = 0x10000; + + /* + * Calculate the base address for the hpre device node. + * Each hpre device follows the corresponding sec device by an additional offset of size. + */ + hwaddr base = vms->memmap[VIRT_KAE_DEVICE].base + dev_id * 2 * size + size; + char *nodename; + + tmm_set_hpre_addr(base, dev_id); + + nodename = g_strdup_printf("/hisi-hpre@%" PRIx64, base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "hisilicon,hip07-hpre-vf"); + qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + g_free(nodename); +} + +static void fdt_add_all_hisi_nodes(const VirtMachineState *vms, int dev_id) +{ + for (int i = 0; i < dev_id; i++) { + fdt_add_hisi_sec_nodes(vms, i); + fdt_add_hisi_hpre_nodes(vms, i); + } } static void machvirt_init(MachineState *machine) @@ -2028,22 +2540,25 @@ static void machvirt_init(MachineState *machine) VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(machine); MachineClass *mc = MACHINE_GET_CLASS(machine); const CPUArchIdList *possible_cpus; - MemoryRegion *sysmem = get_system_memory(); + MemoryRegion *secure_tag_sysmem = NULL; MemoryRegion *secure_sysmem = NULL; MemoryRegion *tag_sysmem = NULL; - MemoryRegion *secure_tag_sysmem = NULL; + MemoryRegion *sysmem; int n, virt_max_cpus; bool firmware_loaded; bool aarch64 = true; bool has_ged = !vmc->no_ged; unsigned int smp_cpus = machine->smp.cpus; unsigned int max_cpus = machine->smp.max_cpus; + ObjectClass *cpu_class; if (!cpu_type_valid(machine->cpu_type)) { error_report("mach-virt: CPU type %s not supported", machine->cpu_type); exit(1); } + finalize_gic_version(vms); + possible_cpus = mc->possible_cpu_arch_ids(machine); /* @@ -2070,10 +2585,7 @@ static void machvirt_init(MachineState *machine) virt_set_memmap(vms, pa_bits); } - /* We can probe only here because during property set - * KVM is not available yet - */ - finalize_gic_version(vms); + sysmem = vms->sysmem = get_system_memory(); if (vms->secure) { /* @@ -2082,7 +2594,7 @@ static void machvirt_init(MachineState *machine) * containing the system memory at low priority; any secure-only * devices go in at higher priority and take precedence. 
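 * (Concretely: memory_region_add_subregion_overlap() below inserts sysmem
 * at priority -1, so a secure-only device mapped later at the default
 * priority 0 shadows that range for accesses made through the secure
 * address space.)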
*/ - secure_sysmem = g_new(MemoryRegion, 1); + secure_sysmem = vms->secure_sysmem = g_new(MemoryRegion, 1); memory_region_init(secure_sysmem, OBJECT(machine), "secure-memory", UINT64_MAX); memory_region_add_subregion_overlap(secure_sysmem, 0, sysmem, -1); @@ -2103,7 +2615,7 @@ */ if (vms->secure && firmware_loaded) { vms->psci_conduit = QEMU_PSCI_CONDUIT_DISABLED; - } else if (vms->virt) { + } else if (vms->virt || virtcca_cvm_enabled()) { vms->psci_conduit = QEMU_PSCI_CONDUIT_SMC; } else { vms->psci_conduit = QEMU_PSCI_CONDUIT_HVC; @@ -2155,107 +2667,129 @@ exit(1); } - create_fdt(vms); + if (vms->mte) { + /* Create the memory region only once, but link to all cpus later */ + tag_sysmem = vms->tag_sysmem = g_new(MemoryRegion, 1); + memory_region_init(tag_sysmem, OBJECT(machine), + "tag-memory", UINT64_MAX / 32); - assert(possible_cpus->len == max_cpus); - for (n = 0; n < possible_cpus->len; n++) { - Object *cpuobj; - CPUState *cs; + if (vms->secure) { + secure_tag_sysmem = vms->secure_tag_sysmem = g_new(MemoryRegion, 1); + memory_region_init(secure_tag_sysmem, OBJECT(machine), + "secure-tag-memory", UINT64_MAX / 32); - if (n >= smp_cpus) { - break; + /* As with ram, secure-tag takes precedence over tag. */ + memory_region_add_subregion_overlap(secure_tag_sysmem, 0, + tag_sysmem, -1); } + } - cpuobj = object_new(possible_cpus->cpus[n].type); - object_property_set_int(cpuobj, "mp-affinity", - possible_cpus->cpus[n].arch_id, NULL); - - cs = CPU(cpuobj); - cs->cpu_index = n; - - numa_cpu_pre_plug(&possible_cpus->cpus[cs->cpu_index], DEVICE(cpuobj), - &error_fatal); + create_fdt(vms); - aarch64 &= object_property_get_bool(cpuobj, "aarch64", NULL); + if (virtcca_cvm_enabled()) { + int kae_num = tmm_get_kae_num(); + fdt_add_all_hisi_nodes(vms, kae_num); - if (!vms->secure) { - object_property_set_bool(cpuobj, "has_el3", false, NULL); + int ret = kvm_arm_tmm_init(machine->cgs, &error_fatal); + if (ret != 0) { + error_report("failed to initialize TMM"); + exit(1); } + } - if (!vms->virt && object_property_find(cpuobj, "has_el2")) { - object_property_set_bool(cpuobj, "has_el2", false, NULL); - } + qemu_log("cpu init start\n"); - if (vmc->kvm_no_adjvtime && - object_property_find(cpuobj, "kvm-no-adjvtime")) { - object_property_set_bool(cpuobj, "kvm-no-adjvtime", true, NULL); - } + cpu_class = object_class_by_name(machine->cpu_type); + has_ged = has_ged && firmware_loaded && + virt_is_acpi_enabled(vms) && + !!object_class_dynamic_cast(cpu_class, TYPE_AARCH64_CPU); - if (vmc->no_kvm_steal_time && - object_property_find(cpuobj, "kvm-steal-time")) { - object_property_set_bool(cpuobj, "kvm-steal-time", false, NULL); - } + if (tcg_enabled() || hvf_enabled() || qtest_enabled() || + (kvm_enabled() && !kvm_smccc_filter_enabled()) || + (vms->gic_version < VIRT_GIC_VERSION_3) || !has_ged) { + vms->cpu_hotplug_enabled = false; + } else { + vms->cpu_hotplug_enabled = true; + } - if (vmc->no_pmu && object_property_find(cpuobj, "pmu")) { - object_property_set_bool(cpuobj, "pmu", false, NULL); + if (!vms->cpu_hotplug_enabled) { + if (machine->smp.max_cpus > smp_cpus) { + warn_report("cpu hotplug feature has been disabled"); } + } - if (vmc->no_tcg_lpa2 && object_property_find(cpuobj, "lpa2")) { - object_property_set_bool(cpuobj, "lpa2", false, NULL); - } + notifier_list_init(&vms->cpuhp_notifiers); + assert(possible_cpus->len == max_cpus); + for (n = 0; n < possible_cpus->len; n++) { + Object *cpuobj; + CPUState *cs; - 
if (object_property_find(cpuobj, "reset-cbar")) { - object_property_set_int(cpuobj, "reset-cbar", - vms->memmap[VIRT_CPUPERIPHS].base, - &error_abort); + if (!vms->cpu_hotplug_enabled && n >= smp_cpus) { + break; } - object_property_set_link(cpuobj, "memory", OBJECT(sysmem), - &error_abort); - if (vms->secure) { - object_property_set_link(cpuobj, "secure-memory", - OBJECT(secure_sysmem), &error_abort); - } - - if (vms->mte) { - /* Create the memory region only once, but link to all cpus. */ - if (!tag_sysmem) { - /* - * The property exists only if MemTag is supported. - * If it is, we must allocate the ram to back that up. - */ - if (!object_property_find(cpuobj, "tag-memory")) { - error_report("MTE requested, but not supported " - "by the guest CPU"); - exit(1); - } + cpuobj = object_new(possible_cpus->cpus[n].type); + cs = CPU(cpuobj); - tag_sysmem = g_new(MemoryRegion, 1); - memory_region_init(tag_sysmem, OBJECT(machine), - "tag-memory", UINT64_MAX / 32); + aarch64 &= object_property_get_bool(cpuobj, "aarch64", NULL); + object_property_set_int(cpuobj, "socket-id", + virt_get_socket_id(machine, n), NULL); + object_property_set_int(cpuobj, "cluster-id", + virt_get_cluster_id(machine, n), NULL); + object_property_set_int(cpuobj, "core-id", + virt_get_core_id(machine, n), NULL); + object_property_set_int(cpuobj, "thread-id", + virt_get_thread_id(machine, n), NULL); + + if (n < smp_cpus) { + qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); + object_unref(cpuobj); + } else { + CPUArchId *cpu_slot; - if (vms->secure) { - secure_tag_sysmem = g_new(MemoryRegion, 1); - memory_region_init(secure_tag_sysmem, OBJECT(machine), - "secure-tag-memory", UINT64_MAX / 32); + /* handling for vcpus which are yet to be hot-plugged */ + cs->cpu_index = n; + cpu_slot = virt_find_cpu_slot(machine, cs->cpu_index); - /* As with ram, secure-tag takes precedence over tag. */ - memory_region_add_subregion_overlap(secure_tag_sysmem, 0, - tag_sysmem, -1); - } - } + /* + * ARM host vCPU features need to be fixed at boot time. But as + * per the current approach this CPU object will be destroyed during + * cpu_post_init(). During hotplug of vCPUs these properties are + * initialized again. + */ + virt_cpu_set_properties(cpuobj, cpu_slot, &error_fatal); - object_property_set_link(cpuobj, "tag-memory", OBJECT(tag_sysmem), - &error_abort); - if (vms->secure) { - object_property_set_link(cpuobj, "secure-tag-memory", - OBJECT(secure_tag_sysmem), - &error_abort); + /* + * For KVM, we shall be pre-creating the now disabled/un-plugged + * possible host vcpus and park them until they are actually + * hot-plugged. This is required to pre-size the host + * GICC and GICR with all possible vcpus for this VM. + */ + if (kvm_enabled()) { + kvm_arm_create_host_vcpu(ARM_CPU(cs)); } + /* + * Add disabled vCPU to CPU slot during the init phase of the virt + * machine + * 1. We need this ARMCPU object during the GIC init. This object + * will facilitate pre-realizing the GIC. Any info like + * mp-affinity (required to derive gicr_type) etc. could still be + * fetched while preserving QOM abstraction akin to realized + * vCPUs. + * 2. Now, after initialization of the virt machine is complete we + * could use two approaches to deal with this ARMCPU object: + * (i) re-use this ARMCPU object during hotplug of this vCPU. + * OR + * (ii) defer releasing this ARMCPU object until after the GIC + * has been initialized, or until the pre-plug phase when a + * vCPU is + * hotplugged. 
+ * + * We will use the (ii) approach and release the ARMCPU objects + * after the GIC and the machine have been fully initialized, during + * the machine_init_done() phase. + */ + cpu_slot->cpu = OBJECT(cs); } - - qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); - object_unref(cpuobj); } fdt_add_timer_nodes(vms); fdt_add_cpu_nodes(vms); @@ -2267,12 +2801,18 @@ create_gic(vms, sysmem); + if (has_ged) { + vms->acpi_dev = create_acpi_ged(vms); + } + virt_cpu_post_init(vms, sysmem); fdt_add_pmu_nodes(vms); create_uart(vms, VIRT_UART, sysmem, serial_hd(0)); + create_cpufreq(vms, sysmem); + if (vms->secure) { create_secure_ram(vms, secure_sysmem, secure_tag_sysmem); create_uart(vms, VIRT_SECURE_UART, secure_sysmem, serial_hd(1)); @@ -2289,9 +2829,7 @@ create_pcie(vms); - if (has_ged && aarch64 && firmware_loaded && virt_is_acpi_enabled(vms)) { - vms->acpi_dev = create_acpi_ged(vms); - } else { + if (!has_ged) { create_gpio_devices(vms, VIRT_GPIO, sysmem); } @@ -2332,6 +2870,9 @@ vms->bootinfo.get_dtb = machvirt_dtb; vms->bootinfo.skip_dtb_autoload = true; vms->bootinfo.firmware_loaded = firmware_loaded; + vms->bootinfo.firmware_base = vms->memmap[VIRT_FLASH].base; + vms->bootinfo.firmware_max_size = vms->memmap[VIRT_FLASH].size; + vms->bootinfo.confidential = virtcca_cvm_enabled(); vms->bootinfo.psci_conduit = vms->psci_conduit; arm_load_kernel(ARM_CPU(first_cpu), machine, &vms->bootinfo); @@ -2661,6 +3202,54 @@ return socket_id % ms->numa_state->num_nodes; } +static int virt_get_socket_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.socket_id; +} + +static int virt_get_cluster_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.cluster_id; +} + +static int virt_get_core_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.core_id; +} + +static int virt_get_thread_id(const MachineState *ms, int cpu_index) +{ + assert(cpu_index >= 0 && cpu_index < ms->possible_cpus->len); + + return ms->possible_cpus->cpus[cpu_index].props.thread_id; +} + +static int +virt_get_cpu_id_from_cpu_topo(const MachineState *ms, DeviceState *dev) +{ + int cpu_id, sock_vcpu_num, clus_vcpu_num, core_vcpu_num; + ARMCPU *cpu = ARM_CPU(dev); + + /* calculate total logical cpus across socket/cluster/core */ + sock_vcpu_num = cpu->socket_id * (ms->smp.threads * ms->smp.cores * + ms->smp.clusters); + clus_vcpu_num = cpu->cluster_id * (ms->smp.threads * ms->smp.cores); + core_vcpu_num = cpu->core_id * ms->smp.threads; + + /* get vcpu-id (logical cpu index) for this vcpu from this topology */ + cpu_id = (sock_vcpu_num + clus_vcpu_num + core_vcpu_num) + cpu->thread_id; + + assert(cpu_id >= 0 && cpu_id < ms->possible_cpus->len); + + return cpu_id; +} + static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) { int n; @@ -2678,6 +3267,7 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { ms->possible_cpus->cpus[n].type = ms->cpu_type; + ms->possible_cpus->cpus[n].vcpus_count = 1; 
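/*
 * Worked example of the topology<->index mapping these per-index props
 * encode (smp values assumed): with sockets=2,clusters=1,cores=2,threads=2,
 * virt_get_cpu_id_from_cpu_topo() folds (socket 1, cluster 0, core 1,
 * thread 0) back to 1*(2*2*1) + 0*(2*2) + 1*2 + 0 = index 6, the inverse
 * of virt_get_socket_id() and friends above.
 */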
ms->possible_cpus->cpus[n].arch_id = virt_cpu_mp_affinity(vms, n); @@ -2698,6 +3288,50 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) return ms->possible_cpus; } +static CPUArchId *virt_find_cpu_slot(MachineState *ms, int vcpuid) +{ + VirtMachineState *vms = VIRT_MACHINE(ms); + CPUArchId *found_cpu; + uint64_t mp_affinity; + + assert(vcpuid >= 0 && vcpuid < ms->possible_cpus->len); + + /* + * RFC: Question: + * TBD: Should mp-affinity be treated as MPIDR? + */ + mp_affinity = virt_cpu_mp_affinity(vms, vcpuid); + found_cpu = &ms->possible_cpus->cpus[vcpuid]; + + assert(found_cpu->arch_id == mp_affinity); + + /* + * RFC: Question: + * Slot-id is the index where a vCPU with a certain arch-id (= mpidr/mp-affinity) + * is plugged. For Host KVM, the MPIDR for a vCPU is derived using vcpu-id. + * As I understand, MPIDR and vcpu-id are properties of the vCPU, but slot-id is + * more related to the machine? Current code assumes slot-id and vcpu-id are the + * same, i.e. the meaning of slot is a bit vague. + * + * Q1: Is there any requirement to clearly represent slot and dissociate it + * from vcpu-id? + * Q2: Should we make MPIDR within host KVM user configurable? + *
 +     *           +----+----+----+----+----+----+----+----+
 +     * MPIDR     |||  Res   |   Aff2   |   Aff1   |  Aff0  |
 +     *           +----+----+----+----+----+----+----+----+
 +     *                 \         \         \        |    |
 +     *                  \  8bit   \  8bit   \       |4bit|
 +     *                   \<------->\<------->\      |<-->|
 +     *                    \         \         \     |    |
 +     *           +----+----+----+----+----+----+----+----+
 +     * VCPU-ID   |  Byte4   |  Byte2   |  Byte1   | Byte0 |
 +     *           +----+----+----+----+----+----+----+----+
 +     */ + + return found_cpu; +} + static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2741,6 +3375,244 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev, dev, &error_abort); } +static void virt_update_gic(VirtMachineState *vms, CPUState *cs) +{ + GICv3CPUHotplugInfo gic_info = { .gic = vms->gic, .cpu = cs }; + + /* notify gic to stitch GICC to this new cpu */ + notifier_list_notify(&vms->cpuhp_notifiers, &gic_info); +} + +static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + ARMCPU *cpu = ARM_CPU(dev); + CPUState *cs = CPU(dev); + CPUArchId *cpu_slot; + + /* sanity check the cpu */ + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + return; + } + + if ((cpu->thread_id < 0) || (cpu->thread_id >= ms->smp.threads)) { + error_setg(errp, "Invalid thread-id %u specified, correct range 0:%u", + cpu->thread_id, ms->smp.threads - 1); + return; + } + + if ((cpu->core_id < 0) || (cpu->core_id >= ms->smp.cores)) { + error_setg(errp, "Invalid core-id %d specified, correct range 0:%u", + cpu->core_id, ms->smp.cores - 1); + return; + } + + if ((cpu->cluster_id < 0) || (cpu->cluster_id >= ms->smp.clusters)) { + error_setg(errp, "Invalid cluster-id %u specified, correct range 0:%u", + cpu->cluster_id, ms->smp.clusters - 1); + return; + } + + if ((cpu->socket_id < 0) || (cpu->socket_id >= ms->smp.sockets)) { + error_setg(errp, "Invalid socket-id %u specified, correct range 0:%u", + cpu->socket_id, ms->smp.sockets - 1); + return; + } + + cs->cpu_index = virt_get_cpu_id_from_cpu_topo(ms, dev); + + /* Except for cold-booted vCPUs, this should check presence of ACPI GED */ + if (cs->cpu_index >= ms->smp.cpus && !vms->acpi_dev) { + error_setg(errp, "GED ACPI device does not exist"); + return; + } + + if (cs->cpu_index >= 
ms->smp.cpus && !vms->cpu_hotplug_enabled) { + error_setg(errp, "CPU [cold|hot]plug not supported on this machine"); + return; + } + + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + if (qemu_present_cpu(CPU(cpu_slot->cpu))) { + error_setg(errp, "cpu(id%d=%d:%d:%d:%d) with arch-id %" PRIu64 " exists", + cs->cpu_index, cpu->socket_id, cpu->cluster_id, cpu->core_id, + cpu->thread_id, cpu_slot->arch_id); + return; + } + virt_cpu_set_properties(OBJECT(cs), cpu_slot, errp); + + /* + * Fix the GIC for this new vCPU being plugged. The QOM CPU object for the + * new vCPU needs to be updated in the corresponding QOM GICv3CPUState object. + * We also need to re-wire the IRQs for this new CPU object. This update + * is limited to the QOM only and does not affect KVM. The latter has + * already been pre-sized with all possible CPUs at VM init time. This is a + * workaround for the constraints posed by the ARM architecture w.r.t. + * supporting CPU hotplug, for which no specification exists. + * This patch-up is required both for {cold,hot}-plugged vCPUs. Cold-inited + * vCPUs have their GIC state initialized during machvirt_init(). + */ + if (vms->acpi_dev) { + virt_update_gic(vms, cs); + wire_gic_cpu_irqs(vms, cs); + } + + /* + * To give the guest a persistent view of vCPU presence, ACPI might need + * to fake the presence of the vCPUs but keep them disabled. + * This shall be used during the init of ACPI Hotplug state and hot-unplug + */ + cs->acpi_persistent = true; + + if (!dev->hotplugged) { + cs->cold_booted = true; + } +#ifdef CONFIG_KVM + if (cs->cpu_index >= ms->smp.cpus) { + cpu->kvm_sve_finalized = true; + } +#endif +} + +static void virt_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + CPUState *cs = CPU(dev); + Error *local_err = NULL; + CPUArchId *cpu_slot; + + /* insert the cold/hot-plugged vcpu in the slot */ + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + cpu_slot->cpu = OBJECT(dev); + + /* + * Update the ACPI Hotplug state both for vCPUs being {hot,cold}-plugged. + * vCPUs can be cold-plugged using the '-device' option. For vCPUs being + * hot-plugged, the guest is also notified. + */ + if (vms->acpi_dev) { + HotplugHandlerClass *hhc; + /* update acpi hotplug state and send cpu hotplug event to guest */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } + /* register this cpu for reset & update F/W info for the next boot */ + qemu_register_reset(do_cpu_reset, ARM_CPU(cs)); + } + + vms->boot_cpus++; + if (vms->fw_cfg) { + fw_cfg_modify_i16(vms->fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); + } + + cs->disabled = false; + return; +fail: + error_propagate(errp, local_err); +} + +static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + HotplugHandlerClass *hhc; + ARMCPU *cpu = ARM_CPU(dev); + CPUState *cs = CPU(dev); + Error *local_err = NULL; + + if (!vms->acpi_dev || !dev->realized) { + error_setg(errp, "GED does not exist or device is not realized"); + return; + } + + if (!vms->cpu_hotplug_enabled) { + error_setg(errp, "CPU hot(un)plug not supported on this machine"); + return; + } + + /* + * UEFI ACPI standard change is required to make both 'enabled' and the + * 'online-capable' bit co-exist instead of being mutually exclusive. 
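+ * (Background, stated as an assumption about the ACPI spec rather than
+ * about this code: in the MADT GICC flags, Enabled is bit 0 and Online
+ * Capable is bit 3, and the current wording allows only one of the two
+ * to be set for a given CPU.)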
+ * check virt_acpi_get_gicc_flags() for more details. + * + * Disable the unplugging of cold-booted vCPUs as a temporary mitigation. + */ + if (cs->cold_booted) { + error_setg(errp, "Hot-unplug of cold-booted CPU not supported!"); + return; + } + + if (cs->cpu_index == first_cpu->cpu_index) { + error_setg(errp, "Boot CPU(id%d=%d:%d:%d:%d) hot-unplug not supported", + first_cpu->cpu_index, cpu->socket_id, cpu->cluster_id, + cpu->core_id, cpu->thread_id); + return; + } + + /* request cpu hot-unplug from guest */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->unplug_request(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } + + return; +fail: + error_propagate(errp, local_err); +} + +static void virt_cpu_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineState *ms = MACHINE(hotplug_dev); + HotplugHandlerClass *hhc; + CPUState *cs = CPU(dev); + Error *local_err = NULL; + CPUArchId *cpu_slot; + + if (!vms->acpi_dev || !dev->realized) { + error_setg(errp, "GED does not exist or device is not realized"); + return; + } + + cpu_slot = virt_find_cpu_slot(ms, cs->cpu_index); + + /* update the acpi cpu hotplug state for cpu hot-unplug */ + hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); + hhc->unplug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &local_err); + if (local_err) { + goto fail; + } + + unwire_gic_cpu_irqs(vms, cs); + virt_update_gic(vms, cs); + + qemu_unregister_reset(do_cpu_reset, ARM_CPU(cs)); + vms->boot_cpus--; + if (vms->fw_cfg) { + fw_cfg_modify_i16(vms->fw_cfg, FW_CFG_NB_CPUS, vms->boot_cpus); + } + + qobject_unref(dev->opts); + dev->opts = NULL; + + cpu_slot->cpu = NULL; + cs->disabled = true; + + return; +fail: + error_propagate(errp, local_err); +} + static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2783,6 +3655,8 @@ static void virt_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, qlist_append_str(reserved_regions, resv_prop_str); qdev_prop_set_array(dev, "reserved-regions", reserved_regions); g_free(resv_prop_str); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_pre_plug(hotplug_dev, dev, errp); } } @@ -2790,10 +3664,34 @@ static void virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); + MachineClass *mc = MACHINE_GET_CLASS(vms); - if (vms->platform_bus_dev) { - MachineClass *mc = MACHINE_GET_CLASS(vms); + /* For smmuv3-nested devices we need to set the mem & irq */ + if (device_is_dynamic_sysbus(mc, dev) && + object_dynamic_cast(OBJECT(dev), TYPE_ARM_SMMUV3_ACCEL)) { + hwaddr base = vms->memmap[VIRT_SMMU_ACCEL].base; + int irq = vms->irqmap[VIRT_SMMU_ACCEL]; + + if (vms->smmu_accel_count >= MAX_SMMU_ACCEL) { + error_setg(errp, "smmuv3-nested max count reached!"); + return; + } + + base += (vms->smmu_accel_count * SMMU_IO_LEN); + irq += (vms->smmu_accel_count * NUM_SMMU_IRQS); + + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); + for (int i = 0; i < 4; i++) { + sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, + qdev_get_gpio_in(vms->gic, irq + i)); + } + if (vms->iommu != VIRT_IOMMU_SMMUV3_ACCEL) { + vms->iommu = VIRT_IOMMU_SMMUV3_ACCEL; + } + vms->smmu_accel_count++; + } + if (vms->platform_bus_dev) { if (device_is_dynamic_sysbus(mc, dev)) { platform_bus_link_device(PLATFORM_BUS_DEVICE(vms->platform_bus_dev), SYS_BUS_DEVICE(dev)); @@ -2804,6 +3702,8 @@ static void 
virt_machine_device_plug_cb(HotplugHandler *hotplug_dev, virt_memory_plug(hotplug_dev, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_plug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_plug(hotplug_dev, dev, errp); } if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { @@ -2861,6 +3761,8 @@ static void virt_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_unplug_request(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_unplug_request(hotplug_dev, dev, errp); } else { error_setg(errp, "device unplug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -2874,6 +3776,8 @@ static void virt_machine_device_unplug_cb(HotplugHandler *hotplug_dev, virt_dimm_unplug(hotplug_dev, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI)) { virtio_md_pci_unplug(VIRTIO_MD_PCI(dev), MACHINE(hotplug_dev), errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + virt_cpu_unplug(hotplug_dev, dev, errp); } else { error_setg(errp, "virt: device unplug for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -2888,7 +3792,8 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, if (device_is_dynamic_sysbus(mc, dev) || object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) || object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MD_PCI) || - object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) { + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || + object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { return HOTPLUG_HANDLER(machine); } return NULL; @@ -2901,6 +3806,15 @@ static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, static int virt_kvm_type(MachineState *ms, const char *type_str) { VirtMachineState *vms = VIRT_MACHINE(ms); + int virtcca_cvm_type = 0; + if (object_property_find(OBJECT(current_machine), "kvm-type")) { + g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), + "kvm-type", &error_abort); + + if (!strcmp(kvm_type, "cvm")) { + virtcca_cvm_type = VIRTCCA_CVM_TYPE; + } + } int max_vm_pa_size, requested_pa_size; bool fixed_ipa; @@ -2930,7 +3844,9 @@ static int virt_kvm_type(MachineState *ms, const char *type_str) * the implicit legacy 40b IPA setting, in which case the kvm_type * must be 0. */ - return fixed_ipa ? 0 : requested_pa_size; + return strcmp(type_str, "cvm") == 0 ? + ((fixed_ipa ? 0 : requested_pa_size) | virtcca_cvm_type) : + (fixed_ipa ? 
0 : requested_pa_size); } static void virt_machine_class_init(ObjectClass *oc, void *data) @@ -2948,6 +3864,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ARM_SMMUV3_ACCEL); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); #endif @@ -2965,6 +3882,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) #endif mc->get_default_cpu_node_id = virt_get_default_cpu_node_id; mc->kvm_type = virt_kvm_type; + mc->has_hotpluggable_cpus = true; assert(!mc->get_hotplug_handler); mc->get_hotplug_handler = virt_machine_get_hotplug_handler; hc->pre_plug = virt_machine_device_pre_plug_cb; @@ -3101,6 +4019,19 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) } +static char *virt_get_kvm_type(Object *obj, Error **errp G_GNUC_UNUSED) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + return g_strdup(vms->kvm_type); +} + +static void virt_set_kvm_type(Object *obj, const char *value, Error **errp G_GNUC_UNUSED) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + g_free(vms->kvm_type); + vms->kvm_type = g_strdup(value); +} + static void virt_instance_init(Object *obj) { VirtMachineState *vms = VIRT_MACHINE(obj); @@ -3115,6 +4046,9 @@ static void virt_instance_init(Object *obj) /* EL2 is also disabled by default, for similar reasons */ vms->virt = false; + /* CPU hotplug is enabled by default */ + vms->cpu_hotplug_enabled = true; + /* High memory is enabled by default */ vms->highmem = true; vms->highmem_compact = !vmc->no_highmem_compact; @@ -3158,6 +4092,9 @@ static void virt_instance_init(Object *obj) vms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); vms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + + object_property_add_str(obj, "kvm-type", virt_get_kvm_type, virt_set_kvm_type); + object_property_set_description(obj, "kvm-type", "CVM or Normal VM"); } static const TypeInfo virt_machine_info = { diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c index 3aa105748d36c1b8efa3c1bd2bc81a353365b7d5..88dfd0bb7fb31d71061c239140c747a5a133f336 100644 --- a/hw/audio/cs4231a.c +++ b/hw/audio/cs4231a.c @@ -682,6 +682,11 @@ static void cs4231a_realizefn (DeviceState *dev, Error **errp) return; } + if (s->irq >= ISA_NUM_IRQS) { + error_setg(errp, "Invalid IRQ %d (max %d)", s->irq, ISA_NUM_IRQS - 1); + return; + } + s->pic = isa_bus_get_irq(bus, s->irq); k = ISADMA_GET_CLASS(s->isa_dma); k->register_channel(s->isa_dma, s->dma, cs_dma_read, s); diff --git a/hw/audio/hda-codec.c b/hw/audio/hda-codec.c index 0bc20d49f6cf43ea37141d4543c2d79d8e2bd68a..ac908e56c6bbc6b17a23ae6f6fac499fc7af213b 100644 --- a/hw/audio/hda-codec.c +++ b/hw/audio/hda-codec.c @@ -487,8 +487,7 @@ static void hda_audio_setup(HDAAudioStream *st) if (st->output) { if (use_timer) { cb = hda_audio_output_cb; - st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, - hda_audio_output_timer, st); + timer_del(st->buft); } else { cb = hda_audio_compat_output_cb; } @@ -497,8 +496,7 @@ static void hda_audio_setup(HDAAudioStream *st) } else { if (use_timer) { cb = hda_audio_input_cb; - st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, - hda_audio_input_timer, st); + timer_del(st->buft); } else { cb = hda_audio_compat_input_cb; } @@ -726,8 +724,12 @@ static void hda_audio_init(HDACodecDevice *hda, st->gain_right = QEMU_HDA_AMP_STEPS; st->compat_bpos = 
sizeof(st->compat_buf); st->output = true; + st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, + hda_audio_output_timer, st); } else { st->output = false; + st->buft = timer_new_ns(QEMU_CLOCK_VIRTUAL, + hda_audio_input_timer, st); } st->format = AC_FMT_TYPE_PCM | AC_FMT_BITS_16 | (1 << AC_FMT_CHAN_SHIFT); @@ -750,9 +752,7 @@ static void hda_audio_exit(HDACodecDevice *hda) if (st->node == NULL) { continue; } - if (a->use_timer) { - timer_del(st->buft); - } + timer_free(st->buft); if (st->output) { AUD_close_out(&a->card, st->voice.out); } else { diff --git a/hw/audio/trace-events b/hw/audio/trace-events index b1870ff224b3c1677d2ec976e981b3495ec3eb9d..b8ef5727678f6dcf8c4d1341f38ea519b946a3f5 100644 --- a/hw/audio/trace-events +++ b/hw/audio/trace-events @@ -41,7 +41,6 @@ asc_update_irq(int irq, int a, int b) "set IRQ to %d (A: 0x%x B: 0x%x)" #virtio-snd.c virtio_snd_get_config(void *vdev, uint32_t jacks, uint32_t streams, uint32_t chmaps) "snd %p: get_config jacks=%"PRIu32" streams=%"PRIu32" chmaps=%"PRIu32"" -virtio_snd_set_config(void *vdev, uint32_t jacks, uint32_t new_jacks, uint32_t streams, uint32_t new_streams, uint32_t chmaps, uint32_t new_chmaps) "snd %p: set_config jacks from %"PRIu32"->%"PRIu32", streams from %"PRIu32"->%"PRIu32", chmaps from %"PRIu32"->%"PRIu32 virtio_snd_get_features(void *vdev, uint64_t features) "snd %p: get_features 0x%"PRIx64 virtio_snd_vm_state_running(void) "vm state running" virtio_snd_vm_state_stopped(void) "vm state stopped" diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index 137fa77a01ce85803962f3027743353b991f179e..9f7a69e408a3cee3782e1762862006114386b972 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -107,29 +107,6 @@ virtio_snd_get_config(VirtIODevice *vdev, uint8_t *config) } -static void -virtio_snd_set_config(VirtIODevice *vdev, const uint8_t *config) -{ - VirtIOSound *s = VIRTIO_SND(vdev); - const virtio_snd_config *sndconfig = - (const virtio_snd_config *)config; - - - trace_virtio_snd_set_config(vdev, - s->snd_conf.jacks, - sndconfig->jacks, - s->snd_conf.streams, - sndconfig->streams, - s->snd_conf.chmaps, - sndconfig->chmaps); - - memcpy(&s->snd_conf, sndconfig, sizeof(virtio_snd_config)); - le32_to_cpus(&s->snd_conf.jacks); - le32_to_cpus(&s->snd_conf.streams); - le32_to_cpus(&s->snd_conf.chmaps); - -} - static void virtio_snd_pcm_buffer_free(VirtIOSoundPCMBuffer *buffer) { @@ -400,7 +377,7 @@ static void virtio_snd_get_qemu_audsettings(audsettings *as, as->nchannels = MIN(AUDIO_MAX_CHANNELS, params->channels); as->fmt = virtio_snd_get_qemu_format(params->format); as->freq = virtio_snd_get_qemu_freq(params->rate); - as->endianness = target_words_bigendian() ? 1 : 0; + as->endianness = 0; /* Conforming to VIRTIO 1.0: always little endian. 
*/ } /* @@ -1274,7 +1251,7 @@ static void virtio_snd_pcm_in_cb(void *data, int available) { VirtIOSoundPCMStream *stream = data; VirtIOSoundPCMBuffer *buffer; - size_t size; + size_t size, max_size; WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) { while (!QSIMPLEQ_EMPTY(&stream->queue)) { @@ -1288,7 +1265,12 @@ static void virtio_snd_pcm_in_cb(void *data, int available) continue; } + max_size = iov_size(buffer->elem->in_sg, buffer->elem->in_num); for (;;) { + if (buffer->size >= max_size) { + return_rx_buffer(stream, buffer); + break; + } size = AUD_read(stream->voice.in, buffer->data + buffer->size, MIN(available, (stream->params.period_bytes - @@ -1399,7 +1381,6 @@ static void virtio_snd_class_init(ObjectClass *klass, void *data) vdc->realize = virtio_snd_realize; vdc->unrealize = virtio_snd_unrealize; vdc->get_config = virtio_snd_get_config; - vdc->set_config = virtio_snd_set_config; vdc->get_features = get_features; vdc->reset = virtio_snd_reset; vdc->legacy_features = 0; diff --git a/hw/block/block.c b/hw/block/block.c index 9f52ee6e728e34f8779a30e2afe7628b29e2dde9..6bece8770917382585ab0447e4bc5ed8df042c30 100644 --- a/hw/block/block.c +++ b/hw/block/block.c @@ -239,6 +239,16 @@ bool blkconf_apply_backend_options(BlockConf *conf, bool readonly, blk_set_enable_write_cache(blk, wce); blk_set_on_error(blk, rerror, werror); + if (rerror == BLOCKDEV_ON_ERROR_RETRY || + werror == BLOCKDEV_ON_ERROR_RETRY) { + if (conf->retry_interval >= 0) { + blk_set_on_error_retry_interval(blk, conf->retry_interval); + } + if (conf->retry_timeout >= 0) { + blk_set_on_error_retry_timeout(blk, conf->retry_timeout); + } + } + block_acct_setup(blk_get_stats(blk), conf->account_invalid, conf->account_failed); return true; diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index a1f8e155227591fb428f065d11658f133e348a6b..beedc0cf5f859e3a1aae094ab1ad5b5bac1a3f0d 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -90,6 +90,10 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, block_acct_failed(blk_get_stats(s->blk), &req->acct); } virtio_blk_free_request(req); + } else if (action == BLOCK_ERROR_ACTION_RETRY) { + req->mr_next = NULL; + req->next = s->rq; + s->rq = req; } blk_error_action(s->blk, action, is_read, error); @@ -131,6 +135,7 @@ static void virtio_blk_rw_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); block_acct_done(blk_get_stats(s->blk), &req->acct); virtio_blk_free_request(req); @@ -150,6 +155,7 @@ static void virtio_blk_flush_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); block_acct_done(blk_get_stats(s->blk), &req->acct); virtio_blk_free_request(req); @@ -172,6 +178,7 @@ static void virtio_blk_discard_write_zeroes_complete(void *opaque, int ret) } } + blk_error_retry_reset_timeout(s->blk); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); if (is_write_zeroes) { block_acct_done(blk_get_stats(s->blk), &req->acct); @@ -783,7 +790,8 @@ static void virtio_blk_handle_zone_report(VirtIOBlockReq *req, sizeof(struct virtio_blk_zone_report) + sizeof(struct virtio_blk_zone_descriptor)) { virtio_error(vdev, "in buffer too small for zone report"); - return; + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + goto out; } /* start byte offset of the zone report */ @@ -852,7 +860,7 @@ static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) } else { if (bs->bl.zone_size > capacity - offset) { /* The zoned device 
allows the last smaller zone. */ - len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1); + len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1ull); } else { len = bs->bl.zone_size; } @@ -1183,12 +1191,12 @@ static void virtio_blk_dma_restart_bh(void *opaque) { VirtIOBlock *s = opaque; - VirtIOBlockReq *req = s->rq; + VirtIOBlockReq *req; MultiReqBuffer mrb = {}; - s->rq = NULL; - aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + req = s->rq; + s->rq = NULL; while (req) { VirtIOBlockReq *next = req->next; if (virtio_blk_handle_request(req, &mrb)) { @@ -1541,10 +1549,44 @@ static void virtio_blk_drained_end(void *opaque) } } +static void virtio_blk_retry_request(void *opaque) +{ + VirtIOBlock *s = VIRTIO_BLK(opaque); + + VirtIOBlockReq *req; + MultiReqBuffer mrb = {}; + + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + req = s->rq; + s->rq = NULL; + while (req) { + VirtIOBlockReq *next = req->next; + if (virtio_blk_handle_request(req, &mrb)) { + /* Device is now broken and won't do any processing until it gets + * reset. Already queued requests will be lost: let's purge them. + */ + while (req) { + next = req->next; + virtqueue_detach_element(req->vq, &req->elem, 0); + virtio_blk_free_request(req); + req = next; + } + break; + } + req = next; + } + + if (mrb.num_reqs) { + virtio_blk_submit_multireq(s, &mrb); + } + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); +} + static const BlockDevOps virtio_block_ops = { .resize_cb = virtio_blk_resize, .drained_begin = virtio_blk_drained_begin, .drained_end = virtio_blk_drained_end, + .retry_request_cb = virtio_blk_retry_request, }; static void virtio_blk_device_realize(DeviceState *dev, Error **errp) diff --git a/hw/char/Kconfig b/hw/char/Kconfig index 6b6cf2fc1dff949b1f454308391d74953624a207..335a60c2c14c5e248aed1377daf6f2e4991bd96c 100644 --- a/hw/char/Kconfig +++ b/hw/char/Kconfig @@ -71,3 +71,7 @@ config GOLDFISH_TTY config SHAKTI_UART bool + +config CPUFREQ + bool + default y diff --git a/hw/char/pl011.c b/hw/char/pl011.c index 58edeb9ddb6b39ead1d47d7bb74a1f5f04f4fd18..bc65d778d23f8ae9c2d52d9244858e052ebc3401 100644 --- a/hw/char/pl011.c +++ b/hw/char/pl011.c @@ -314,6 +314,10 @@ static void pl011_write(void *opaque, hwaddr offset, case 17: /* UARTICR */ s->int_level &= ~value; pl011_update(s); + if (!s->int_enabled && !s->int_level) { + s->read_count = 0; + s->read_pos = 0; + } break; case 18: /* UARTDMACR */ s->dmacr = value; diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index dd619f0731e8ef7f0f8e8ff647131a998918bc4b..096214b11b183e6ed420d46d5a293388778769c3 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -257,6 +257,8 @@ static size_t send_control_event(VirtIOSerial *vser, uint32_t port_id, virtio_stw_p(vdev, &cpkt.value, value); trace_virtio_serial_send_control_event(port_id, event, value); + qemu_log("virtio serial port %d sending control message" + " event = %d, value = %d\n", port_id, event, value); return send_control_msg(vser, &cpkt, sizeof(cpkt)); } @@ -364,6 +366,9 @@ static void handle_control_message(VirtIOSerial *vser, void *buf, size_t len) cpkt.value = virtio_lduw_p(vdev, &gcpkt->value); trace_virtio_serial_handle_control_message(cpkt.event, cpkt.value); + qemu_log("virtio serial port '%u' handling control message" + " event = %d, value = %d\n", + virtio_ldl_p(vdev, &gcpkt->id), cpkt.event, cpkt.value); if (cpkt.event == VIRTIO_CONSOLE_DEVICE_READY) { if (!cpkt.value) { @@ -985,8 +990,7 @@ static void 
virtser_port_device_realize(DeviceState *dev, Error **errp) return; } - port->bh = qemu_bh_new_guarded(flush_queued_data_bh, port, - &dev->mem_reentrancy_guard); + port->bh = virtio_bh_new_guarded(dev, flush_queued_data_bh, port); port->elem = NULL; } diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c index 82dae51a550b52b836fe9802784e6d50fb71c337..e36ca2c207093a41ff4554dc41941fe424edad25 100644 --- a/hw/core/cpu-common.c +++ b/hw/core/cpu-common.c @@ -262,6 +262,10 @@ static void cpu_common_finalize(Object *obj) { CPUState *cpu = CPU(obj); + /* If cleanup didn't happen in context to gdb_unregister_coprocessor_all */ + if (cpu->gdb_regs) { + g_array_free(cpu->gdb_regs, TRUE); + } qemu_lockcnt_destroy(&cpu->in_ioctl_lock); qemu_mutex_destroy(&cpu->work_mutex); } diff --git a/hw/core/gpio.c b/hw/core/gpio.c index 80d07a6ec99ade02cbf69ed4b2f691e04eef03d7..abb164d5c0e7d10c02ea0002e125d3a35bca5da6 100644 --- a/hw/core/gpio.c +++ b/hw/core/gpio.c @@ -143,7 +143,7 @@ qemu_irq qdev_get_gpio_out_connector(DeviceState *dev, const char *name, int n) /* disconnect a GPIO output, returning the disconnected input (if any) */ -static qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, +qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, const char *name, int n) { char *propname = g_strdup_printf("%s[%d]", diff --git a/hw/core/irq.c b/hw/core/irq.c index 3f14e2dda74709f35014a37d5997f0c646b4f240..df9b5dac9be4fffb545286aaf2ab3d41da1b42df 100644 --- a/hw/core/irq.c +++ b/hw/core/irq.c @@ -110,7 +110,10 @@ void qemu_irq_intercept_in(qemu_irq *gpio_in, qemu_irq_handler handler, int n) int i; qemu_irq *old_irqs = qemu_allocate_irqs(NULL, NULL, n); for (i = 0; i < n; i++) { - *old_irqs[i] = *gpio_in[i]; + old_irqs[i]->handler = gpio_in[i]->handler; + old_irqs[i]->opaque = gpio_in[i]->opaque; + old_irqs[i]->n = gpio_in[i]->n; + gpio_in[i]->handler = handler; gpio_in[i]->opaque = &old_irqs[i]; } diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c index 3860a50c3b7bad1f496f2de980624e824e58dec7..f1389ef644a91c7f2eab7571372f18d8cc5194ed 100644 --- a/hw/core/machine-qmp-cmds.c +++ b/hw/core/machine-qmp-cmds.c @@ -8,6 +8,7 @@ */ #include "qemu/osdep.h" +#include "sysemu/rtc.h" #include "hw/acpi/vmgenid.h" #include "hw/boards.h" #include "hw/intc/intc.h" @@ -373,6 +374,11 @@ HumanReadableText *qmp_x_query_irq(Error **errp) return human_readable_text_from_str(buf); } +int64_t qmp_query_rtc_date_diff(Error **errp) +{ + return get_rtc_date_diff(); +} + GuidInfo *qmp_query_vm_generation_id(Error **errp) { GuidInfo *info; diff --git a/hw/core/machine.c b/hw/core/machine.c index 0c1739814124ca1045a383a1f3b9e791221a0bd6..965682619bdf23d58c33cab7c965ee3225c23ca8 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -160,6 +160,8 @@ const size_t hw_compat_4_0_len = G_N_ELEMENTS(hw_compat_4_0); GlobalProperty hw_compat_3_1[] = { { "pcie-root-port", "x-speed", "2_5" }, { "pcie-root-port", "x-width", "1" }, + { "pcie-root-port", "fast-plug", "0" }, + { "pcie-root-port", "fast-unplug", "0" }, { "memory-backend-file", "x-use-canonical-path-for-ramblock-id", "true" }, { "memory-backend-memfd", "x-use-canonical-path-for-ramblock-id", "true" }, { "tpm-crb", "ppi", "false" }, diff --git a/hw/core/numa.c b/hw/core/numa.c index f08956ddb0ff794833213b1bbe98c7702871743d..98d896e68704465a7785d90ee67f0318060d41ac 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -42,6 +42,8 @@ #include "qemu/option.h" #include "qemu/config-file.h" #include "qemu/cutils.h" +#include "exec/address-spaces.h" +#include 
"sysemu/kvm.h" QemuOptsList qemu_numa_opts = { .name = "numa", @@ -641,6 +643,21 @@ static void numa_init_memdev_container(MachineState *ms, MemoryRegion *ram) } } +/* + * Add virtcca_shared_hugepage as a sub-MR to the root MR of address space + * address_space_memory and address_space_virtcca_shared_memory. + */ +static void virtcca_shared_memory_configuration(MachineState *ms) +{ + MemoryRegion *alias_mr = g_new(MemoryRegion, 1); + + memory_region_add_subregion_overlap(ms->ram, 0, virtcca_shared_hugepage, 1); + memory_region_init_alias(alias_mr, NULL, "alias-mr", virtcca_shared_hugepage, + 0, int128_get64(virtcca_shared_hugepage->size)); + memory_region_add_subregion(address_space_virtcca_shared_memory.root, + virtcca_cvm_gpa_start, alias_mr); +} + void numa_complete_configuration(MachineState *ms) { int i; @@ -711,6 +728,10 @@ void numa_complete_configuration(MachineState *ms) memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id, ms->ram_size); numa_init_memdev_container(ms, ms->ram); + if (virtcca_cvm_enabled() && virtcca_shared_hugepage && + virtcca_shared_hugepage->ram_block) { + virtcca_shared_memory_configuration(ms); + } } /* QEMU needs at least all unique node pair distances to build * the whole NUMA distance table. QEMU treats the distance table diff --git a/hw/core/platform-bus.c b/hw/core/platform-bus.c index b8487b26b674e46e445749118abc7b2cc6918f0f..dc58bf505aa2f5c0460eed9b41d9ee41d92b28f1 100644 --- a/hw/core/platform-bus.c +++ b/hw/core/platform-bus.c @@ -145,9 +145,12 @@ static void platform_bus_map_mmio(PlatformBusDevice *pbus, SysBusDevice *sbdev, * the target device's memory region */ for (off = 0; off < pbus->mmio_size; off += alignment) { - if (!memory_region_find(&pbus->mmio, off, size).mr) { + MemoryRegion *mr = memory_region_find(&pbus->mmio, off, size).mr; + if (!mr) { found_region = true; break; + } else { + memory_region_unref(mr); } } diff --git a/hw/core/ptimer.c b/hw/core/ptimer.c index e03165febf149dcdb25b1a4bb5c39a0a8fcd5869..7177ecfab0dbbb2e6ea2f4f60d371b00cff9a9af 100644 --- a/hw/core/ptimer.c +++ b/hw/core/ptimer.c @@ -83,7 +83,7 @@ static void ptimer_reload(ptimer_state *s, int delta_adjust) delta = s->delta = s->limit; } - if (s->period == 0) { + if (s->period == 0 && s->period_frac == 0) { if (!qtest_enabled()) { fprintf(stderr, "Timer with period zero, disabling\n"); } @@ -309,7 +309,7 @@ void ptimer_run(ptimer_state *s, int oneshot) assert(s->in_transaction); - if (was_disabled && s->period == 0) { + if (was_disabled && s->period == 0 && s->period_frac == 0) { if (!qtest_enabled()) { fprintf(stderr, "Timer with period zero, disabling\n"); } diff --git a/hw/core/qdev-prop-internal.h b/hw/core/qdev-prop-internal.h index d7b77844fe0b2fa14ee8db6728b4ce9fb005e4e5..68b1b9d10c86ab4195c245e7708442de7675e179 100644 --- a/hw/core/qdev-prop-internal.h +++ b/hw/core/qdev-prop-internal.h @@ -22,6 +22,8 @@ void qdev_propinfo_set_default_value_uint(ObjectProperty *op, void qdev_propinfo_get_int32(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp); +void qdev_propinfo_get_int64(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp); void qdev_propinfo_get_size32(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp); diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 1473ab3d5e956796fd445ad0d109a9619bfd93b8..9cc2e38aba27e0cf5d33a599ccc9e21c2f4245dc 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -635,6 +635,51 @@ const PropertyInfo 
qdev_prop_blockdev_on_error = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +static void set_retry_time(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + DeviceState *dev = DEVICE(obj); + Property *prop = opaque; + int64_t value, *ptr = object_field_prop_ptr(obj, prop); + Error *local_err = NULL; + + if (dev->realized) { + qdev_prop_set_after_realize(dev, name, errp); + return; + } + + visit_type_int64(v, name, &value, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + /* value should not be negative */ + if (value < 0) { + error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE, + dev->id ? : "", name, value, INT64_C(0), INT64_MAX); + return; + } + + *ptr = value; +} + +const PropertyInfo qdev_prop_blockdev_retry_interval = { + .name = "BlockdevRetryInterval", + .description = "Interval for retry error handling policy", + .get = qdev_propinfo_get_int64, + .set = set_retry_time, + .set_default_value = qdev_propinfo_set_default_value_int, +}; + +const PropertyInfo qdev_prop_blockdev_retry_timeout = { + .name = "BlockdevRetryTimeout", + .description = "Timeout for retry error handling policy", + .get = qdev_propinfo_get_int64, + .set = set_retry_time, + .set_default_value = qdev_propinfo_set_default_value_int, +}; + /* --- BIOS CHS translation */ QEMU_BUILD_BUG_ON(sizeof(BiosAtaTranslation) != sizeof(int)); @@ -666,7 +711,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd", + "none/zlib/zstd/qpl/uadk/qatzip", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, @@ -687,6 +732,16 @@ const PropertyInfo qdev_prop_mig_mode = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +const PropertyInfo qdev_prop_zero_page_detection = { + .name = "ZeroPageDetection", + .description = "zero_page_detection values, " + "none,legacy,multifd", + .enum_table = &ZeroPageDetection_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- Reserved Region --- */ /* @@ -1157,6 +1212,17 @@ const PropertyInfo qdev_prop_uuid = { .set_default_value = set_default_uuid_auto, }; +/* --- CompressMethod --- */ +const PropertyInfo qdev_prop_compress_method = { + .name = "CompressMethod", + .description = "multi-thread compression method, " + "zlib/zstd", + .enum_table = &CompressMethod_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- s390 cpu entitlement policy --- */ QEMU_BUILD_BUG_ON(sizeof(CpuS390Entitlement) != sizeof(int)); diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c index 840006e953c077c4f00bca8c90b1ede16fd3daf5..19b7450b4db19459f9bb1e00986e0a38afd254a1 100644 --- a/hw/core/qdev-properties.c +++ b/hw/core/qdev-properties.c @@ -398,7 +398,7 @@ static void set_uint64(Object *obj, Visitor *v, const char *name, visit_type_uint64(v, name, ptr, errp); } -static void get_int64(Object *obj, Visitor *v, const char *name, +void qdev_propinfo_get_int64(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { Property *prop = opaque; @@ -425,7 +425,7 @@ const PropertyInfo qdev_prop_uint64 = { const PropertyInfo qdev_prop_int64 = { .name = "int64", - .get = get_int64, + .get = qdev_propinfo_get_int64, .set = set_int64,
.set_default_value = qdev_propinfo_set_default_value_int, }; diff --git a/hw/core/reset.c b/hw/core/reset.c index d3263b613e661eeb7605d2e63ad2321c7838df46..fa63bfedb7083188a5e798dd35534b8635b61738 100644 --- a/hw/core/reset.c +++ b/hw/core/reset.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "qemu/queue.h" +#include "qemu/log.h" #include "sysemu/reset.h" /* reset/shutdown handler */ @@ -75,6 +76,7 @@ void qemu_devices_reset(ShutdownCause reason) { QEMUResetEntry *re, *nre; + qemu_log("reset all devices\n"); /* reset all devices */ QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) { if (reason == SHUTDOWN_CAUSE_SNAPSHOT_LOAD && diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c index eebcd28f9a377178022016399dcc3ce4cc12854b..58f4dc614c2ccedf475d7a92e4eff8982b232732 100644 --- a/hw/core/sysbus-fdt.c +++ b/hw/core/sysbus-fdt.c @@ -489,6 +489,7 @@ static const BindingEntry bindings[] = { #ifdef CONFIG_LINUX TYPE_BINDING(TYPE_VFIO_CALXEDA_XGMAC, add_calxeda_midway_xgmac_fdt_node), TYPE_BINDING(TYPE_VFIO_AMD_XGBE, add_amd_xgbe_fdt_node), + TYPE_BINDING("arm-smmuv3-accel", no_fdt_node), VFIO_PLATFORM_BINDING("amd,xgbe-seattle-v1a", add_amd_xgbe_fdt_node), #endif #ifdef CONFIG_TPM diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c index 2aa776c79c7465198328046d32072c58110c7fc7..c5f5fcfd64d00d400e8d3640afb1bbe0de7bb49b 100644 --- a/hw/cxl/cxl-host.c +++ b/hw/cxl/cxl-host.c @@ -26,6 +26,7 @@ static void cxl_fixed_memory_window_config(CXLState *cxl_state, CXLFixedMemoryWindowOptions *object, Error **errp) { + ERRP_GUARD(); g_autofree CXLFixedWindow *fw = g_malloc0(sizeof(*fw)); strList *target; int i; diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 6eff56fb1b345d1fd440f18c6b39541186157ac2..68d6457becde027f73f3bc9c5e2dd2a9b0740a1d 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -505,6 +505,9 @@ static CXLRetCode cmd_get_physical_port_state(const struct cxl_cmd *cmd, in = (struct cxl_fmapi_get_phys_port_state_req_pl *)payload_in; out = (struct cxl_fmapi_get_phys_port_state_resp_pl *)payload_out; + if (len_in < sizeof(*in)) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; + } /* Check if what was requested can fit */ if (sizeof(*out) + sizeof(*out->ports) * in->num_ports > cci->payload_max) { return CXL_MBOX_INVALID_INPUT; @@ -897,8 +900,8 @@ static CXLRetCode cmd_ccls_set_lsa(const struct cxl_cmd *cmd, const size_t hdr_len = offsetof(struct set_lsa_pl, data); *len_out = 0; - if (!len_in) { - return CXL_MBOX_SUCCESS; + if (len_in < hdr_len) { + return CXL_MBOX_INVALID_PAYLOAD_LENGTH; } if (set_lsa_payload->offset + len_in > cvc->get_lsa_size(ct3d) + hdr_len) { diff --git a/hw/display/bcm2835_fb.c b/hw/display/bcm2835_fb.c index a05277674f2c64634e1489fe895545d50e8ec2bb..c45da149d99f25ec3ced04e0bbfd96a2cec7f11a 100644 --- a/hw/display/bcm2835_fb.c +++ b/hw/display/bcm2835_fb.c @@ -145,7 +145,7 @@ static bool fb_use_offsets(BCM2835FBConfig *config) * viewport size is larger than the physical screen. (It doesn't * prevent the guest setting this silly viewport setting, though...) 
*/ - return config->xres_virtual > config->xres && + return config->xres_virtual > config->xres || config->yres_virtual > config->yres; } diff --git a/hw/display/macfb.c b/hw/display/macfb.c index d61541ccb5d5d3d13c65539bef4f237f690fc540..170da35757aded66339c41a195a857396d49d719 100644 --- a/hw/display/macfb.c +++ b/hw/display/macfb.c @@ -714,6 +714,7 @@ static void macfb_nubus_set_irq(void *opaque, int n, int level) static void macfb_nubus_realize(DeviceState *dev, Error **errp) { + ERRP_GUARD(); NubusDevice *nd = NUBUS_DEVICE(dev); MacfbNubusState *s = NUBUS_MACFB(dev); MacfbNubusDeviceClass *ndc = NUBUS_MACFB_GET_CLASS(dev); diff --git a/hw/display/vga.c b/hw/display/vga.c index 37557c3442aa8b709e74930502ed3a573da3d222..3f1358676bd99d59917e45e6b3e20c14ff814a3b 100644 --- a/hw/display/vga.c +++ b/hw/display/vga.c @@ -39,6 +39,8 @@ #include "migration/vmstate.h" #include "trace.h" +#include "sysemu/kvm.h" + //#define DEBUG_VGA_MEM //#define DEBUG_VGA_REG @@ -1748,6 +1750,13 @@ static void vga_draw_blank(VGACommonState *s, int full_update) if (s->last_scr_width <= 0 || s->last_scr_height <= 0) return; + if (is_buffer_shared(surface)) { + /* unshare buffer, otherwise the blanking corrupts vga vram */ + surface = qemu_create_displaysurface(s->last_scr_width, + s->last_scr_height); + dpy_gfx_replace_surface(s->con, surface); + } + w = s->last_scr_width * surface_bytes_per_pixel(surface); d = surface_data(surface); for(i = 0; i < s->last_scr_height; i++) { @@ -1783,6 +1792,11 @@ static void vga_update_display(void *opaque) s->cursor_blink_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); full_update = 1; } + + /* Force to full update in CSV guest. */ + if (kvm_csv3_enabled()) + full_update = 1; + switch(graphic_mode) { case GMODE_TEXT: vga_draw_text(s, full_update); diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c index 709c8a02a1468dfb26355bf363e717f1b410c183..373f04a7b43e4f6c7387847a2e70732dda1396d1 100644 --- a/hw/display/vhost-user-gpu.c +++ b/hw/display/vhost-user-gpu.c @@ -385,7 +385,7 @@ vhost_user_gpu_chr_read(void *opaque) } msg->request = request; - msg->flags = size; + msg->flags = flags; msg->size = size; if (request == VHOST_USER_GPU_CURSOR_UPDATE || diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c index 8bb7a2c21fe7878731cbe9289206ac4f1fc80535..9f34d0e6619cbb9a66a7398722ab134fbf490e5b 100644 --- a/hw/display/virtio-gpu-virgl.c +++ b/hw/display/virtio-gpu-virgl.c @@ -181,7 +181,7 @@ static void virgl_cmd_set_scanout(VirtIOGPU *g, memset(&info, 0, sizeof(info)); ret = virgl_renderer_resource_get_info(ss.resource_id, &info); #endif - if (ret == -1) { + if (ret) { qemu_log_mask(LOG_GUEST_ERROR, "%s: illegal resource specified %d\n", __func__, ss.resource_id); diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index b016d3bac85c27ccbfb64241d6d9e5f319957401..a7146388221f7dacfa50d2eea65beb3fa541cadb 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -679,10 +679,6 @@ static void virtio_gpu_do_set_scanout(VirtIOGPU *g, /* realloc the surface ptr */ scanout->ds = qemu_create_displaysurface_pixman(rect); - if (!scanout->ds) { - *error = VIRTIO_GPU_RESP_ERR_UNSPEC; - return; - } #ifdef WIN32 qemu_displaysurface_win32_set_handle(scanout->ds, res->handle, fb->offset); #endif @@ -1418,9 +1414,6 @@ static int virtio_gpu_post_load(void *opaque, int version_id) return -EINVAL; } scanout->ds = qemu_create_displaysurface_pixman(res->image); - if (!scanout->ds) { - return -EINVAL; - } #ifdef WIN32 
qemu_displaysurface_win32_set_handle(scanout->ds, res->handle, 0); #endif @@ -1463,10 +1456,8 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp) g->ctrl_vq = virtio_get_queue(vdev, 0); g->cursor_vq = virtio_get_queue(vdev, 1); - g->ctrl_bh = qemu_bh_new_guarded(virtio_gpu_ctrl_bh, g, - &qdev->mem_reentrancy_guard); - g->cursor_bh = qemu_bh_new_guarded(virtio_gpu_cursor_bh, g, - &qdev->mem_reentrancy_guard); + g->ctrl_bh = virtio_bh_new_guarded(qdev, virtio_gpu_ctrl_bh, g); + g->cursor_bh = virtio_bh_new_guarded(qdev, virtio_gpu_cursor_bh, g); g->reset_bh = qemu_bh_new(virtio_gpu_reset_bh, g); qemu_cond_init(&g->reset_cond); QTAILQ_INIT(&g->reslist); diff --git a/hw/gpio/aspeed_gpio.c b/hw/gpio/aspeed_gpio.c index 1e267dd48203c23d3266124ef0e14aa1bd275c64..0fc3d4c05fcf4766f50e230971388b06bec4f8f3 100644 --- a/hw/gpio/aspeed_gpio.c +++ b/hw/gpio/aspeed_gpio.c @@ -281,7 +281,7 @@ static void aspeed_gpio_update(AspeedGPIOState *s, GPIOSets *regs, diff &= mode_mask; if (diff) { for (gpio = 0; gpio < ASPEED_GPIOS_PER_SET; gpio++) { - uint32_t mask = 1 << gpio; + uint32_t mask = 1U << gpio; /* If the gpio needs to be updated... */ if (!(diff & mask)) { diff --git a/hw/i2c/smbus_slave.c b/hw/i2c/smbus_slave.c index 2ef2c7c5f69495464bc5965ae9b251db6425ad70..b35516a404c510768de3a61d967bd5a632aa57c8 100644 --- a/hw/i2c/smbus_slave.c +++ b/hw/i2c/smbus_slave.c @@ -25,11 +25,15 @@ #define DPRINTF(fmt, ...) \ do { printf("smbus(%02x): " fmt , dev->i2c.address, ## __VA_ARGS__); } while (0) #define BADF(fmt, ...) \ -do { fprintf(stderr, "smbus: error: " fmt , ## __VA_ARGS__); exit(1);} while (0) +do { g_autofree char *qom_path = object_get_canonical_path(OBJECT(dev)); \ + fprintf(stderr, "%s: smbus: error: " fmt , qom_path, ## __VA_ARGS__); \ + exit(1); } while (0) #else #define DPRINTF(fmt, ...) do {} while(0) #define BADF(fmt, ...) \ -do { fprintf(stderr, "smbus: error: " fmt , ## __VA_ARGS__);} while (0) +do { g_autofree char *qom_path = object_get_canonical_path(OBJECT(dev)); \ + fprintf(stderr, "%s: smbus: error: " fmt , qom_path, ## __VA_ARGS__); \ + } while (0) #endif enum { diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 55850791df4148f5535eb06b76e09dabf75d84f1..908f29e02bc6a8ff84e3fe147d4da9dcbf739f41 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -10,6 +10,15 @@ config SGX bool depends on KVM +config CSV + bool + select HYGON_CSV_MIG_ACCEL + depends on SEV + +config HYGON_CSV_MIG_ACCEL + bool + depends on CSV + config PC bool imply APPLESMC @@ -26,6 +35,7 @@ config PC imply QXL imply SEV imply SGX + imply CSV imply TEST_DEVICES imply TPM_CRB imply TPM_TIS_ISA @@ -95,6 +105,7 @@ config Q35 imply E1000E_PCI_EXPRESS imply VMPORT imply VMMOUSE + imply IOMMUFD select PC_PCI select PC_ACPI select PCI_EXPRESS_Q35 diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 80db183b786afbe1fc44b04e45ac0eaa08f7f3d2..e10799ecc6b2ad9c1f63195126f48434a33ee425 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1545,8 +1545,9 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, .smi_path = pm->smi_on_cpuhp ? 
"\\_SB.PCI0.SMI0.SMIC" : NULL, .fw_unplugs_cpu = pm->smi_on_cpu_unplug, }; - build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, - pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02"); + build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, NULL, + pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02", + AML_SYSTEM_IO); } if (pcms->memhp_io_base && nr_mem) { diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 4203144da9865efa61afc0df2fae12f8fd432728..12742b1433f82ffc2c482518eaa8671829dff851 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -346,12 +346,12 @@ static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, uint64_t gpa, IOMMUTLBEntry to_cache, uint16_t domid) { - AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); - uint64_t *key = g_new(uint64_t, 1); - uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; - /* don't cache erroneous translations */ if (to_cache.perm != IOMMU_NONE) { + AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); + uint64_t *key = g_new(uint64_t, 1); + uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; + trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), gpa, to_cache.translated_addr); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 5085a6fee3f1d4543b5947c1bc99cbc06982239d..60d86e0cb692c56f43f82feee9c3c2916fa83580 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -61,6 +61,12 @@ struct vtd_as_key { uint32_t pasid; }; +/* bus/devfn is PCI device's real BDF not the aliased one */ +struct vtd_hiod_key { + PCIBus *bus; + uint8_t devfn; +}; + struct vtd_iotlb_key { uint64_t gfn; uint32_t pasid; @@ -250,6 +256,25 @@ static guint vtd_as_hash(gconstpointer v) return (guint)(value << 8 | key->devfn); } +/* Same implementation as vtd_as_hash() */ +static guint vtd_hiod_hash(gconstpointer v) +{ + return vtd_as_hash(v); +} + +static gboolean vtd_hiod_equal(gconstpointer v1, gconstpointer v2) +{ + const struct vtd_hiod_key *key1 = v1; + const struct vtd_hiod_key *key2 = v2; + + return (key1->bus == key2->bus) && (key1->devfn == key2->devfn); +} + +static void vtd_hiod_destroy(gpointer v) +{ + object_unref(v); +} + static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, gpointer user_data) { @@ -2813,6 +2838,7 @@ static void vtd_handle_iqt_write(IntelIOMMUState *s) if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { error_report_once("%s: RSV bit is set: val=0x%"PRIx64, __func__, val); + vtd_handle_inv_queue_error(s); return; } s->iq_tail = VTD_IQT_QT(s->iq_dw, val); @@ -3812,6 +3838,87 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, return vtd_dev_as; } +static bool vtd_check_hiod(IntelIOMMUState *s, HostIOMMUDevice *hiod, + Error **errp) +{ + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod); + int ret; + + if (!hiodc->get_cap) { + error_setg(errp, ".get_cap() not implemented"); + return false; + } + + /* Common checks */ + ret = hiodc->get_cap(hiod, HOST_IOMMU_DEVICE_CAP_AW_BITS, errp); + if (ret < 0) { + return false; + } + if (s->aw_bits > ret) { + error_setg(errp, "aw-bits %d > host aw-bits %d", s->aw_bits, ret); + return false; + } + + return true; +} + +static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *hiod, Error **errp) +{ + IntelIOMMUState *s = opaque; + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + }; + struct vtd_as_key *new_key; + + assert(hiod); + + vtd_iommu_lock(s); + + if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { + error_setg(errp, "Host IOMMU device already exist"); + 
vtd_iommu_unlock(s); + return false; + } + + if (!vtd_check_hiod(s, hiod, errp)) { + vtd_iommu_unlock(s); + return false; + } + + new_key = g_malloc(sizeof(*new_key)); + new_key->bus = bus; + new_key->devfn = devfn; + + object_ref(hiod); + g_hash_table_insert(s->vtd_host_iommu_dev, new_key, hiod); + + vtd_iommu_unlock(s); + + return true; +} + +static void vtd_dev_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) +{ + IntelIOMMUState *s = opaque; + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + }; + + vtd_iommu_lock(s); + + if (!g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { + vtd_iommu_unlock(s); + return; + } + + g_hash_table_remove(s->vtd_host_iommu_dev, &key); + + vtd_iommu_unlock(s); +} + /* Unmap the whole range in the notifier's scope. */ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) { @@ -3934,30 +4041,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) return; } -/* Do the initialization. It will also be called when reset, so pay - * attention when adding new initialization stuff. - */ -static void vtd_init(IntelIOMMUState *s) +static void vtd_cap_init(IntelIOMMUState *s) { X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); - memset(s->csr, 0, DMAR_REG_SIZE); - memset(s->wmask, 0, DMAR_REG_SIZE); - memset(s->w1cmask, 0, DMAR_REG_SIZE); - memset(s->womask, 0, DMAR_REG_SIZE); - - s->root = 0; - s->root_scalable = false; - s->dmar_enabled = false; - s->intr_enabled = false; - s->iq_head = 0; - s->iq_tail = 0; - s->iq = 0; - s->iq_size = 0; - s->qi_enabled = false; - s->iq_last_desc_type = VTD_INV_DESC_NONE; - s->iq_dw = false; - s->next_frcd_reg = 0; s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | VTD_CAP_MGAW(s->aw_bits); @@ -3974,27 +4061,6 @@ static void vtd_init(IntelIOMMUState *s) } s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; - /* - * Rsvd field masks for spte - */ - vtd_spte_rsvd[0] = ~0ULL; - vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); - vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); - vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); - - vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); - - if (s->scalable_mode || s->snoop_control) { - vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; - vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; - vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; - } - if (x86_iommu_ir_supported(x86_iommu)) { s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; if (s->intr_eim == ON_OFF_AUTO_ON) { @@ -4027,6 +4093,56 @@ static void vtd_init(IntelIOMMUState *s) if (s->pasid) { s->ecap |= VTD_ECAP_PASID; } +} + +/* + * Do the initialization. It will also be called when reset, so pay + * attention when adding new initialization stuff. 
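+ * The capability and extended capability registers themselves are + * computed in vtd_cap_init() below.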
+ */ +static void vtd_init(IntelIOMMUState *s) +{ + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + memset(s->csr, 0, DMAR_REG_SIZE); + memset(s->wmask, 0, DMAR_REG_SIZE); + memset(s->w1cmask, 0, DMAR_REG_SIZE); + memset(s->womask, 0, DMAR_REG_SIZE); + + s->root = 0; + s->root_scalable = false; + s->dmar_enabled = false; + s->intr_enabled = false; + s->iq_head = 0; + s->iq_tail = 0; + s->iq = 0; + s->iq_size = 0; + s->qi_enabled = false; + s->iq_last_desc_type = VTD_INV_DESC_NONE; + s->iq_dw = false; + s->next_frcd_reg = 0; + + vtd_cap_init(s); + + /* + * Rsvd field masks for spte + */ + vtd_spte_rsvd[0] = ~0ULL; + vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); + vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); + + vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, + x86_iommu->dt_supported); + + if (s->scalable_mode || s->snoop_control) { + vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; + } vtd_reset_caches(s); @@ -4107,6 +4223,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) static PCIIOMMUOps vtd_iommu_ops = { .get_address_space = vtd_host_dma_iommu, + .set_iommu_device = vtd_dev_set_iommu_device, + .unset_iommu_device = vtd_dev_unset_iommu_device, }; static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) @@ -4230,6 +4348,8 @@ static void vtd_realize(DeviceState *dev, Error **errp) g_free, g_free); s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal, g_free, g_free); + s->vtd_host_iommu_dev = g_hash_table_new_full(vtd_hiod_hash, vtd_hiod_equal, + g_free, vtd_hiod_destroy); vtd_init(s); pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. 
*/ diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 29b9964733ed118f330753c04146bebf4580cddf..204e34db86c53361f217b3b9174d015e3c5aeb08 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -743,6 +743,111 @@ void xen_load_linux(PCMachineState *pcms) x86ms->fw_cfg = fw_cfg; } +static int try_create_2MB_page(uint32_t page_num) +{ + char nr_hp_num_s[256] = {0}; + char free_hp_num_s[256] = {0}; + const char *nr_hugepages_dir = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"; + const char *free_hugepages_dir = "/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"; + int nr_hp_num = -1, free_hp_num = -1, ret = -1; + int nr_fd = qemu_open_old(nr_hugepages_dir, O_RDWR); + int free_fd = qemu_open_old(free_hugepages_dir, O_RDONLY); + + if (nr_fd < 0 || free_fd < 0) { + error_report("%s: qemu_open failed: %s", __func__, strerror(errno)); + goto end; + } + + if (read(nr_fd, nr_hp_num_s, sizeof(nr_hp_num_s) - 1) < 0) + goto end; + if (read(free_fd, free_hp_num_s, sizeof(free_hp_num_s) - 1) < 0) + goto end; + + nr_hp_num = atoi(nr_hp_num_s); + free_hp_num = atoi(free_hp_num_s); + if (nr_hp_num < 0 || free_hp_num < 0) + goto end; + + if (page_num <= free_hp_num) { + ret = 0; + goto end; + } + + nr_hp_num += (page_num - free_hp_num); + snprintf(nr_hp_num_s, sizeof(nr_hp_num_s), "%d", nr_hp_num); + if (write(nr_fd, nr_hp_num_s, strlen(nr_hp_num_s)) < 0) + goto end; + + ret = 0; +end: + if (nr_fd >= 0) + close(nr_fd); + if (free_fd >= 0) + close(free_fd); + return ret; +} + +#define HUGEPAGE_NUM_MAX 128 +#define HUGEPAGE_SIZE (1024*1024*2) +static void mem2_init(MachineState *ms, MemoryRegion *system_memory) +{ + MemoryRegion *mem2_mr; + char mr_name[128] = {0}; + void *ram = NULL; + int ret = 0, lock_fd; + const char *lock_file = "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages"; + uint32_t page_num = ms->ram2_size / HUGEPAGE_SIZE, i; + + if (HUGEPAGE_NUM_MAX < page_num) { + error_report("\"-mem2 'size='\" needs to be less than %dM", + (HUGEPAGE_SIZE * HUGEPAGE_NUM_MAX) / (1024 * 1024)); + exit(EXIT_FAILURE); + } + + /* Request hugepages from the OS and use them; this needs to be synchronized */ + lock_fd = qemu_open_old(lock_file, O_WRONLY); + if (lock_fd < 0) { + error_report("%s: open %s failed: %s", __func__, lock_file, strerror(errno)); + exit(EXIT_FAILURE); + } + + while (qemu_lock_fd(lock_fd, 0, 0, true)) { + if (errno != EACCES && errno != EAGAIN) { + error_report("qemu_lock_fd failed: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + } + + /* Try to create the hugepages. + * If there are enough free hugepages, then do nothing.
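+ * Otherwise, raise nr_hugepages by the shortfall so the mappings below succeed.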
+ */ + ret = try_create_2MB_page(page_num); + if (ret) { + error_report("%s: Failed to allocate hugepages", __func__); + goto unlock; + } + + for (i = 0; i < page_num; ++i) { + mem2_mr = g_malloc(sizeof(*mem2_mr)); + ram = mmap(NULL, HUGEPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, -1, 0); + if (ram == MAP_FAILED) { + error_report("%s: mmap failed: %s", __func__, strerror(errno)); + goto unlock; + } + + snprintf(mr_name, sizeof(mr_name), "mem2-%u", i); + memory_region_init_ram_ptr(mem2_mr, NULL, mr_name, HUGEPAGE_SIZE, ram); + memory_region_add_subregion(system_memory, ms->ram2_base + (i * HUGEPAGE_SIZE), mem2_mr); + } + + ret = 0; +unlock: + qemu_unlock_fd(lock_fd, 0, 0); + if (ret) + exit(EXIT_FAILURE); +} + #define PC_ROM_MIN_VGA 0xc0000 #define PC_ROM_MIN_OPTION 0xc8000 #define PC_ROM_MAX 0xe0000 @@ -965,6 +1070,22 @@ void pc_memory_init(PCMachineState *pcms, E820_RAM); } + if (machine->ram2_size && machine->ram2_base) { + if (0x100000000ULL + x86ms->above_4g_mem_size > machine->ram2_base) { + error_report("\"-mem2 'base'\" needs to be greater than 0x%llx", + 0x100000000ULL + x86ms->above_4g_mem_size); + exit(EXIT_FAILURE); + } + if (machine->ram2_base & (HUGEPAGE_SIZE - 1) || + machine->ram2_size & (HUGEPAGE_SIZE - 1)) { + error_report("\"-mem2 'base|size'\" needs to be aligned to 0x%x", HUGEPAGE_SIZE); + exit(EXIT_FAILURE); + } + + mem2_init(machine, system_memory); + e820_add_entry(machine->ram2_base, machine->ram2_size, E820_RAM); + } + if (pcms->sgx_epc.size != 0) { e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED); } diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index c8d9e71b889b673bc64d89f3d01a240007610552..7c6a9102501b9146372ac0674cf24e85511ef4b4 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -37,6 +37,7 @@ #include "hw/block/flash.h" #include "sysemu/kvm.h" #include "sev.h" +#include "csv.h" #define FLASH_SECTOR_SIZE 4096 @@ -263,7 +264,21 @@ void x86_firmware_configure(void *ptr, int size) error_report("failed to locate and/or save reset vector"); exit(1); } + if (csv3_enabled()) { + ram_addr_t offset = 0; + MemoryRegion *mr; - sev_encrypt_flash(ptr, size, &error_fatal); + if (kvm_csv3_should_set_priv_mem()) + csv3_set_guest_private_memory(&error_fatal); + + mr = memory_region_from_host(ptr, &offset); + if (!mr) { + error_report("failed to get memory region of flash"); + exit(1); + } + csv3_load_data(mr->addr + offset, ptr, size, &error_fatal); + } else { + sev_encrypt_flash(ptr, size, &error_fatal); + } } } diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index afdc44b8e056dc4e6f999d699b47d65e896f951c..8062e1743c1d6d2cf3b7643f81afffcd5c0fcfcc 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -1519,8 +1519,10 @@ static void ahci_commit_buf(const IDEDMA *dma, uint32_t tx_bytes) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); - tx_bytes += le32_to_cpu(ad->cur_cmd->status); - ad->cur_cmd->status = cpu_to_le32(tx_bytes); + if (ad->cur_cmd) { + tx_bytes += le32_to_cpu(ad->cur_cmd->status); + ad->cur_cmd->status = cpu_to_le32(tx_bytes); + } } static int ahci_dma_rw_buf(const IDEDMA *dma, bool is_write) diff --git a/hw/ide/macio.c b/hw/ide/macio.c index dca1cc9efc1d435a098d28395e652b04561374ec..3d895c07f459faab1d1a2bd1f19b5d51bbc9cf00 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -119,9 +119,6 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) return; done: - dma_memory_unmap(&address_space_memory, io->dma_mem, io->dma_len, - io->dir, io->dma_len); - if (ret < 0) {
block_acct_failed(blk_get_stats(s->blk), &s->acct); } else { @@ -202,9 +199,6 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) return; done: - dma_memory_unmap(&address_space_memory, io->dma_mem, io->dma_len, - io->dir, io->dma_len); - if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) { if (ret < 0) { block_acct_failed(blk_get_stats(s->blk), &s->acct); } diff --git a/hw/input/ps2.c b/hw/input/ps2.c index c8fd23cf3600f415fe68e9259ed57ebaa09a8863..b647561069023d65349143df7a6e5abd33ae3968 100644 --- a/hw/input/ps2.c +++ b/hw/input/ps2.c @@ -167,7 +167,7 @@ void ps2_queue_noirq(PS2State *s, int b) } q->data[q->wptr] = b; - if (++q->wptr == PS2_BUFFER_SIZE) { + if (++q->wptr >= PS2_BUFFER_SIZE) { q->wptr = 0; } q->count++; @@ -557,7 +557,7 @@ uint32_t ps2_read_data(PS2State *s) val = q->data[index]; } else { val = q->data[q->rptr]; - if (++q->rptr == PS2_BUFFER_SIZE) { + if (++q->rptr >= PS2_BUFFER_SIZE) { q->rptr = 0; } q->count--; diff --git a/hw/intc/Kconfig b/hw/intc/Kconfig index 97d550b06b77d16d08410183af798c50a1535bb0..91c7aa668e3bf908c3f86d242dd0403066eb8dc4 100644 --- a/hw/intc/Kconfig +++ b/hw/intc/Kconfig @@ -93,10 +93,16 @@ config NIOS2_VIC config LOONGARCH_IPI bool +config LOONGARCH_IPI_KVM + bool + config LOONGARCH_PCH_PIC bool select UNIMP +config LOONGARCH_PCH_PIC_KVM + bool + config LOONGARCH_PCH_MSI select MSI_NONBROKEN bool @@ -104,3 +110,6 @@ config LOONGARCH_EXTIOI bool + +config LOONGARCH_EXTIOI_KVM + bool diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c index 074cf50af25da0321eecf1e42c0d4d9fc86e2a04..f0582f7a49729e81c1f99e9e6393d33c16260ce2 100644 --- a/hw/intc/arm_gic.c +++ b/hw/intc/arm_gic.c @@ -1263,9 +1263,14 @@ static void gic_dist_writeb(void *opaque, hwaddr offset, trace_gic_enable_irq(irq + i); } GIC_DIST_SET_ENABLED(irq + i, cm); - /* If a raised level triggered IRQ enabled then mark - is as pending. */ - if (GIC_DIST_TEST_LEVEL(irq + i, mask) + /* + * If a raised level-triggered IRQ is enabled then mark + * it as pending on 11MPCore. For other GIC revisions we + * handle the "level triggered and line asserted" check + * at the other end in gic_test_pending().
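+ * (There, gic_test_pending() ORs the line level into the pending + * state for level-triggered interrupts.)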
+ */ + if (s->revision == REV_11MPCORE + && GIC_DIST_TEST_LEVEL(irq + i, mask) && !GIC_DIST_TEST_EDGE_TRIGGER(irq + i)) { DPRINTF("Set %d pending mask %x\n", irq + i, mask); GIC_DIST_SET_PENDING(irq + i, mask); @@ -1658,7 +1663,7 @@ static MemTxResult gic_cpu_read(GICState *s, int cpu, int offset, *data = s->h_apr[gic_get_vcpu_real_id(cpu)]; } else if (gic_cpu_ns_access(s, cpu, attrs)) { /* NS view of GICC_APR is the top half of GIC_NSAPR */ - *data = gic_apr_ns_view(s, regno, cpu); + *data = gic_apr_ns_view(s, cpu, regno); } else { *data = s->apr[regno][cpu]; } @@ -1746,7 +1751,7 @@ static MemTxResult gic_cpu_write(GICState *s, int cpu, int offset, s->h_apr[gic_get_vcpu_real_id(cpu)] = value; } else if (gic_cpu_ns_access(s, cpu, attrs)) { /* NS view of GICC_APR is the top half of GIC_NSAPR */ - gic_apr_write_ns_view(s, regno, cpu, value); + gic_apr_write_ns_view(s, cpu, regno, value); } else { s->apr[regno][cpu] = value; } diff --git a/hw/intc/arm_gicv3.c b/hw/intc/arm_gicv3.c index 0b8f79a122765f31c28cbd51464b1e407b5ac90a..e1c7c8c4bc804598426c2645b5e2175a604a02f4 100644 --- a/hw/intc/arm_gicv3.c +++ b/hw/intc/arm_gicv3.c @@ -410,6 +410,7 @@ static void arm_gicv3_class_init(ObjectClass *klass, void *data) ARMGICv3Class *agc = ARM_GICV3_CLASS(klass); agcc->post_load = arm_gicv3_post_load; + agcc->init_cpu_reginfo = gicv3_init_cpu_reginfo; device_class_set_parent_realize(dc, arm_gic_realize, &agc->parent_realize); } diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c index 2ebf880eada850e7880aa478ec0b44bafa5fe9ef..5667d9f40b453ce9cd4956260bea37e0b860205f 100644 --- a/hw/intc/arm_gicv3_common.c +++ b/hw/intc/arm_gicv3_common.c @@ -25,6 +25,7 @@ #include "qapi/error.h" #include "qemu/module.h" #include "qemu/error-report.h" +#include "hw/boards.h" #include "hw/core/cpu.h" #include "hw/intc/arm_gicv3_common.h" #include "hw/qdev-properties.h" @@ -33,7 +34,6 @@ #include "hw/arm/linux-boot-if.h" #include "sysemu/kvm.h" - static void gicv3_gicd_no_migration_shift_bug_post_load(GICv3State *cs) { if (cs->gicd_no_migration_shift_bug) { @@ -322,6 +322,61 @@ void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, } } +static int arm_gicv3_get_proc_num(GICv3State *s, CPUState *cpu) +{ + uint64_t mp_affinity; + uint64_t gicr_typer; + uint64_t cpu_affid; + int i; + + mp_affinity = object_property_get_uint(OBJECT(cpu), "mp-affinity", NULL); + /* match the cpu mp-affinity to get the gic cpuif number */ + for (i = 0; i < s->num_cpu; i++) { + gicr_typer = s->cpu[i].gicr_typer; + cpu_affid = (gicr_typer >> 32) & 0xFFFFFF; + if (cpu_affid == mp_affinity) { + return i; + } + } + + return -1; +} + +static void arm_gicv3_cpu_update_notifier(Notifier *notifier, void * data) +{ + GICv3CPUHotplugInfo *gic_info = (GICv3CPUHotplugInfo *)data; + CPUState *cpu = gic_info->cpu; + ARMGICv3CommonClass *c; + int gic_cpuif_num; + GICv3State *s; + + s = ARM_GICV3_COMMON(gic_info->gic); + c = ARM_GICV3_COMMON_GET_CLASS(s); + + /* this shall get us mapped gicv3 cpuif corresponding to mpidr */ + gic_cpuif_num = arm_gicv3_get_proc_num(s, cpu); + if (gic_cpuif_num < 0) { + error_report("Failed to associate cpu %d with any GIC cpuif", + cpu->cpu_index); + abort(); + } + + /* check if update is for vcpu hot-unplug */ + if (qemu_enabled_cpu(cpu)) { + s->cpu[gic_cpuif_num].cpu = NULL; + return; + } + + /* re-stitch the gic cpuif to this new cpu */ + gicv3_set_gicv3state(cpu, &s->cpu[gic_cpuif_num]); + gicv3_set_cpustate(&s->cpu[gic_cpuif_num], cpu); + + /* initialize the registers info for this newly 
added cpu */ + if (c->init_cpu_reginfo) { + c->init_cpu_reginfo(cpu); + } +} + static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) { GICv3State *s = ARM_GICV3_COMMON(dev); @@ -392,10 +447,13 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) s->cpu = g_new0(GICv3CPUState, s->num_cpu); for (i = 0; i < s->num_cpu; i++) { - CPUState *cpu = qemu_get_cpu(i); + CPUState *cpu = qemu_get_possible_cpu(i) ? : qemu_get_cpu(i); uint64_t cpu_affid; - s->cpu[i].cpu = cpu; + if (qemu_enabled_cpu(cpu)) { + s->cpu[i].cpu = cpu; + } + s->cpu[i].gic = s; /* Store GICv3CPUState in CPUARMState gicv3state pointer */ gicv3_set_gicv3state(cpu, &s->cpu[i]); @@ -441,13 +499,20 @@ static void arm_gicv3_common_realize(DeviceState *dev, Error **errp) s->cpu[cpuidx - 1].gicr_typer |= GICR_TYPER_LAST; } + s->cpu_update_notifier.notify = arm_gicv3_cpu_update_notifier; + s->itslist = g_ptr_array_new(); } static void arm_gicv3_finalize(Object *obj) { GICv3State *s = ARM_GICV3_COMMON(obj); + Object *ms = qdev_get_machine(); + MachineClass *mc = MACHINE_GET_CLASS(ms); + if (mc->has_hotpluggable_cpus) { + notifier_remove(&s->cpu_update_notifier); + } g_free(s->redist_region_count); } diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index ab1a00508e6f178c6059c8644492a4b455febd87..a0135100744115fe3326dc965c662708f43799ad 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -934,6 +934,10 @@ void gicv3_cpuif_update(GICv3CPUState *cs) ARMCPU *cpu = ARM_CPU(cs->cpu); CPUARMState *env = &cpu->env; + if (!qemu_enabled_cpu(cs->cpu)) { + return; + } + g_assert(qemu_mutex_iothread_locked()); trace_gicv3_cpuif_update(gicv3_redist_affid(cs), cs->hppi.irq, @@ -1826,6 +1830,10 @@ static void icc_generate_sgi(CPUARMState *env, GICv3CPUState *cs, for (i = 0; i < s->num_cpu; i++) { GICv3CPUState *ocs = &s->cpu[i]; + if (!qemu_enabled_cpu(ocs->cpu)) { + continue; + } + if (irm) { /* IRM == 1 : route to all CPUs except self */ if (cs == ocs) { @@ -2774,6 +2782,127 @@ static const ARMCPRegInfo gicv3_cpuif_ich_apxr23_reginfo[] = { }, }; +void gicv3_init_cpu_reginfo(CPUState *cs) +{ + ARMCPU *cpu = ARM_CPU(cs); + GICv3CPUState *gcs = icc_cs_from_env(&cpu->env); + + /* + * If the CPU doesn't define a GICv3 configuration, probably because + * in real hardware it doesn't have one, then we use default values + * matching the one used by most Arm CPUs. This applies to: + * cpu->gic_num_lrs + * cpu->gic_vpribits + * cpu->gic_vprebits + * cpu->gic_pribits + */ + + /* + * Note that we can't just use the GICv3CPUState as an opaque pointer + * in define_arm_cp_regs_with_opaque(), because when we're called back + * it might be with code translated by CPU 0 but run by CPU 1, in + * which case we'd get the wrong value. + * So instead we define the regs with no ri->opaque info, and + * get back to the GICv3CPUState from the CPUARMState. + */ + define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); + + /* + * The CPU implementation specifies the number of supported + * bits of physical priority. For backwards compatibility + * of migration, we have a compat property that forces use + * of 8 priority bits regardless of what the CPU really has. + */ + if (gcs->gic->force_8bit_prio) { + gcs->pribits = 8; + } else { + gcs->pribits = cpu->gic_pribits ?: 5; + } + + /* + * The GICv3 has separate ID register fields for virtual priority + * and preemption bit values, but only a single ID register field + * for the physical priority bits. 
The preemption bit count is + * always the same as the priority bit count, except that 8 bits + * of priority means 7 preemption bits. We precalculate the + * preemption bits because it simplifies the code and makes the + * parallels between the virtual and physical bits of the GIC + * a bit clearer. + */ + gcs->prebits = gcs->pribits; + if (gcs->prebits == 8) { + gcs->prebits--; + } + /* + * Check that CPU code defining pribits didn't violate + * architectural constraints our implementation relies on. + */ + g_assert(gcs->pribits >= 4 && gcs->pribits <= 8); + + /* + * gicv3_cpuif_reginfo[] defines ICC_AP*R0_EL1; add definitions + * for ICC_AP*R{1,2,3}_EL1 if the prebits value requires them. + */ + if (gcs->prebits >= 6) { + define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr1_reginfo); + } + if (gcs->prebits == 7) { + define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr23_reginfo); + } + + if (arm_feature(&cpu->env, ARM_FEATURE_EL2)) { + int j; + + gcs->num_list_regs = cpu->gic_num_lrs ?: 4; + gcs->vpribits = cpu->gic_vpribits ?: 5; + gcs->vprebits = cpu->gic_vprebits ?: 5; + + /* + * Check against architectural constraints: getting these + * wrong would be a bug in the CPU code defining these, + * and the implementation relies on them holding. + */ + g_assert(gcs->vprebits <= gcs->vpribits); + g_assert(gcs->vprebits >= 5 && gcs->vprebits <= 7); + g_assert(gcs->vpribits >= 5 && gcs->vpribits <= 8); + + define_arm_cp_regs(cpu, gicv3_cpuif_hcr_reginfo); + + for (j = 0; j < gcs->num_list_regs; j++) { + /* + * Note that the AArch64 LRs are 64-bit; the AArch32 LRs + * are split into two cp15 regs, LR (the low part, with the + * same encoding as the AArch64 LR) and LRC (the high part). + */ + ARMCPRegInfo lr_regset[] = { + { .name = "ICH_LRn_EL2", .state = ARM_CP_STATE_BOTH, + .opc0 = 3, .opc1 = 4, .crn = 12, + .crm = 12 + (j >> 3), .opc2 = j & 7, + .type = ARM_CP_IO | ARM_CP_NO_RAW, + .access = PL2_RW, + .readfn = ich_lr_read, + .writefn = ich_lr_write, + }, + { .name = "ICH_LRCn_EL2", .state = ARM_CP_STATE_AA32, + .cp = 15, .opc1 = 4, .crn = 12, + .crm = 14 + (j >> 3), .opc2 = j & 7, + .type = ARM_CP_IO | ARM_CP_NO_RAW, + .access = PL2_RW, + .readfn = ich_lr_read, + .writefn = ich_lr_write, + }, + }; + define_arm_cp_regs(cpu, lr_regset); + } + if (gcs->vprebits >= 6) { + define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr1_reginfo); + } + if (gcs->vprebits == 7) { + define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr23_reginfo); + } + } +} + static void gicv3_cpuif_el_change_hook(ARMCPU *cpu, void *opaque) { GICv3CPUState *cs = opaque; @@ -2796,131 +2925,23 @@ void gicv3_init_cpuif(GICv3State *s) for (i = 0; i < s->num_cpu; i++) { ARMCPU *cpu = ARM_CPU(qemu_get_cpu(i)); - GICv3CPUState *cs = &s->cpu[i]; - - /* - * If the CPU doesn't define a GICv3 configuration, probably because - * in real hardware it doesn't have one, then we use default values - * matching the one used by most Arm CPUs. This applies to: - * cpu->gic_num_lrs - * cpu->gic_vpribits - * cpu->gic_vprebits - * cpu->gic_pribits - */ - - /* Note that we can't just use the GICv3CPUState as an opaque pointer - * in define_arm_cp_regs_with_opaque(), because when we're called back - * it might be with code translated by CPU 0 but run by CPU 1, in - * which case we'd get the wrong value. - * So instead we define the regs with no ri->opaque info, and - * get back to the GICv3CPUState from the CPUARMState. - * - * These CP regs callbacks can be called from either TCG or HVF code. 
- */ - define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); - - /* - * The CPU implementation specifies the number of supported - * bits of physical priority. For backwards compatibility - * of migration, we have a compat property that forces use - * of 8 priority bits regardless of what the CPU really has. - */ - if (s->force_8bit_prio) { - cs->pribits = 8; - } else { - cs->pribits = cpu->gic_pribits ?: 5; - } - - /* - * The GICv3 has separate ID register fields for virtual priority - * and preemption bit values, but only a single ID register field - * for the physical priority bits. The preemption bit count is - * always the same as the priority bit count, except that 8 bits - * of priority means 7 preemption bits. We precalculate the - * preemption bits because it simplifies the code and makes the - * parallels between the virtual and physical bits of the GIC - * a bit clearer. - */ - cs->prebits = cs->pribits; - if (cs->prebits == 8) { - cs->prebits--; - } - /* - * Check that CPU code defining pribits didn't violate - * architectural constraints our implementation relies on. - */ - g_assert(cs->pribits >= 4 && cs->pribits <= 8); - - /* - * gicv3_cpuif_reginfo[] defines ICC_AP*R0_EL1; add definitions - * for ICC_AP*R{1,2,3}_EL1 if the prebits value requires them. - */ - if (cs->prebits >= 6) { - define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr1_reginfo); - } - if (cs->prebits == 7) { - define_arm_cp_regs(cpu, gicv3_cpuif_icc_apxr23_reginfo); - } - - if (arm_feature(&cpu->env, ARM_FEATURE_EL2)) { - int j; - cs->num_list_regs = cpu->gic_num_lrs ?: 4; - cs->vpribits = cpu->gic_vpribits ?: 5; - cs->vprebits = cpu->gic_vprebits ?: 5; - - /* Check against architectural constraints: getting these - * wrong would be a bug in the CPU code defining these, - * and the implementation relies on them holding. - */ - g_assert(cs->vprebits <= cs->vpribits); - g_assert(cs->vprebits >= 5 && cs->vprebits <= 7); - g_assert(cs->vpribits >= 5 && cs->vpribits <= 8); - - define_arm_cp_regs(cpu, gicv3_cpuif_hcr_reginfo); - - for (j = 0; j < cs->num_list_regs; j++) { - /* Note that the AArch64 LRs are 64-bit; the AArch32 LRs - * are split into two cp15 regs, LR (the low part, with the - * same encoding as the AArch64 LR) and LRC (the high part). + if (qemu_enabled_cpu(CPU(cpu))) { + GICv3CPUState *cs = icc_cs_from_env(&cpu->env); + gicv3_init_cpu_reginfo(CPU(cpu)); + if (tcg_enabled() || qtest_enabled()) { + /* + * We can only trap EL changes with TCG. However the GIC + * interrupt state only changes on EL changes involving EL2 or + * EL3, so for the non-TCG case this is OK, as EL2 and EL3 can't + * exist. 
*/ - ARMCPRegInfo lr_regset[] = { - { .name = "ICH_LRn_EL2", .state = ARM_CP_STATE_BOTH, - .opc0 = 3, .opc1 = 4, .crn = 12, - .crm = 12 + (j >> 3), .opc2 = j & 7, - .type = ARM_CP_IO | ARM_CP_NO_RAW, - .access = PL2_RW, - .readfn = ich_lr_read, - .writefn = ich_lr_write, - }, - { .name = "ICH_LRCn_EL2", .state = ARM_CP_STATE_AA32, - .cp = 15, .opc1 = 4, .crn = 12, - .crm = 14 + (j >> 3), .opc2 = j & 7, - .type = ARM_CP_IO | ARM_CP_NO_RAW, - .access = PL2_RW, - .readfn = ich_lr_read, - .writefn = ich_lr_write, - }, - }; - define_arm_cp_regs(cpu, lr_regset); - } - if (cs->vprebits >= 6) { - define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr1_reginfo); - } - if (cs->vprebits == 7) { - define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr23_reginfo); + arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, + cs); + } else { + assert(!arm_feature(&cpu->env, ARM_FEATURE_EL2)); + assert(!arm_feature(&cpu->env, ARM_FEATURE_EL3)); } } - if (tcg_enabled() || qtest_enabled()) { - /* - * We can only trap EL changes with TCG. However the GIC interrupt - * state only changes on EL changes involving EL2 or EL3, so for - * the non-TCG case this is OK, as EL2 and EL3 can't exist. - */ - arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, cs); - } else { - assert(!arm_feature(&cpu->env, ARM_FEATURE_EL2)); - assert(!arm_feature(&cpu->env, ARM_FEATURE_EL3)); - } } } diff --git a/hw/intc/arm_gicv3_cpuif_common.c b/hw/intc/arm_gicv3_cpuif_common.c index ff1239f65db946da62754436fbf67fbd50ce378c..381cf2754b7e4095d547aefc716c7d2937d49369 100644 --- a/hw/intc/arm_gicv3_cpuif_common.c +++ b/hw/intc/arm_gicv3_cpuif_common.c @@ -20,3 +20,8 @@ void gicv3_set_gicv3state(CPUState *cpu, GICv3CPUState *s) env->gicv3state = (void *)s; }; + +void gicv3_set_cpustate(GICv3CPUState *s, CPUState *cpu) +{ + s->cpu = cpu; +} diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 77eb37e13173f8dc0e71614ac0c511fe4f23f97b..dd2a60fa20d2341568a9fcdfd9da3114b9ed73cc 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -24,6 +24,7 @@ #include "hw/intc/arm_gicv3_common.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "sysemu/cpus.h" #include "sysemu/kvm.h" #include "sysemu/runstate.h" #include "kvm_arm.h" @@ -458,6 +459,18 @@ static void kvm_arm_gicv3_put(GICv3State *s) GICv3CPUState *c = &s->cpu[ncpu]; int num_pri_bits; + /* + * To support hotplug of vcpus we need to make sure all gic cpuif/GICC + * are initialized at machvirt init time. Once the init is done we + * release the ARMCPU object for disabled vcpus, but this path can also + * be hit later, during reset of the GICC, i.e. after init has happened; + * in all of these cases we want to make sure we don't access the GICC + * for the disabled vCPUs. + */ + if (!qemu_enabled_cpu(c->cpu)) { + continue; + } + kvm_gicc_access(s, ICC_SRE_EL1, ncpu, &c->icc_sre_el1, true); kvm_gicc_access(s, ICC_CTLR_EL1, ncpu, &c->icc_ctlr_el1[GICV3_NS], true); @@ -616,6 +629,11 @@ static void kvm_arm_gicv3_get(GICv3State *s) GICv3CPUState *c = &s->cpu[ncpu]; int num_pri_bits; + /* don't access GICC for the disabled vCPUs. */ + if (!qemu_enabled_cpu(c->cpu)) { + continue; + } + kvm_gicc_access(s, ICC_SRE_EL1, ncpu, &c->icc_sre_el1, false); kvm_gicc_access(s, ICC_CTLR_EL1, ncpu, &c->icc_ctlr_el1[GICV3_NS], false); @@ -695,10 +713,19 @@ static void arm_gicv3_icc_reset(CPUARMState *env, const ARMCPRegInfo *ri) return; } + /* + * This shall be called even when a vcpu is being hotplugged or onlined and + * other vcpus might be running.
Host kernel KVM code to handle device + * access of IOCTLs KVM_{GET|SET}_DEVICE_ATTR might fail due to inability to + * grab vcpu locks for all the vcpus. Hence, we need to pause all vcpus to + * facilitate locking within host. + */ + pause_all_vcpus(); /* Initialize to actual HW supported configuration */ kvm_device_access(s->dev_fd, KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS, KVM_VGIC_ATTR(ICC_CTLR_EL1, c->gicr_typer), &c->icc_ctlr_el1[GICV3_NS], false, &error_abort); + resume_all_vcpus(); c->icc_ctlr_el1[GICV3_S] = c->icc_ctlr_el1[GICV3_NS]; } @@ -777,6 +804,10 @@ static void vm_change_state_handler(void *opaque, bool running, } } +static void kvm_gicv3_init_cpu_reginfo(CPUState *cs) +{ + define_arm_cp_regs(ARM_CPU(cs), gicv3_cpuif_reginfo); +} static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) { @@ -808,9 +839,10 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) gicv3_init_irqs_and_mmio(s, kvm_arm_gicv3_set_irq, NULL); for (i = 0; i < s->num_cpu; i++) { - ARMCPU *cpu = ARM_CPU(qemu_get_cpu(i)); - - define_arm_cp_regs(cpu, gicv3_cpuif_reginfo); + CPUState *cs = qemu_get_cpu(i); + if (qemu_enabled_cpu(cs)) { + kvm_gicv3_init_cpu_reginfo(cs); + } } /* Try to create the device via the device control API */ @@ -897,6 +929,7 @@ static void kvm_arm_gicv3_class_init(ObjectClass *klass, void *data) agcc->pre_save = kvm_arm_gicv3_get; agcc->post_load = kvm_arm_gicv3_put; + agcc->init_cpu_reginfo = kvm_gicv3_init_cpu_reginfo; device_class_set_parent_realize(dc, kvm_arm_gicv3_realize, &kgc->parent_realize); resettable_class_set_parent_phases(rc, NULL, kvm_arm_gicv3_reset_hold, NULL, diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h index 29d5cdc1b6984365320c4c1b22eb16358950d01f..0bed0f6e2af2a566452094ba9028fcc1cb859d1c 100644 --- a/hw/intc/gicv3_internal.h +++ b/hw/intc/gicv3_internal.h @@ -709,6 +709,7 @@ void gicv3_redist_vinvall(GICv3CPUState *cs, uint64_t vptaddr); void gicv3_redist_send_sgi(GICv3CPUState *cs, int grp, int irq, bool ns); void gicv3_init_cpuif(GICv3State *s); +void gicv3_init_cpu_reginfo(CPUState *cs); /** * gicv3_cpuif_update: @@ -848,5 +849,6 @@ static inline void gicv3_cache_all_target_cpustates(GICv3State *s) } void gicv3_set_gicv3state(CPUState *cpu, GICv3CPUState *s); +void gicv3_set_cpustate(GICv3CPUState *s, CPUState *cpu); #endif /* QEMU_ARM_GICV3_INTERNAL_H */ diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index 716ffc8bbbda1db4cfba8ecbdbce04ffc8455564..0b43aec8fa97504f7c8dc0b9450ffa1a0b7388d8 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -195,6 +195,7 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s) int i; if (kvm_irqchip_is_split()) { + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); for (i = 0; i < IOAPIC_NUM_PINS; i++) { MSIMessage msg; struct ioapic_entry_info info; @@ -202,10 +203,10 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s) if (!info.masked) { msg.address = info.addr; msg.data = info.data; - kvm_irqchip_update_msi_route(kvm_state, i, msg, NULL); + kvm_irqchip_update_msi_route(&c, i, msg, NULL); } } - kvm_irqchip_commit_routes(kvm_state); + kvm_irqchip_commit_route_changes(&c); } #endif } diff --git a/hw/intc/loongarch_extioi.c b/hw/intc/loongarch_extioi.c index 24fb3af8cc31fceda1169b9e26f549ac6b299a46..fa23e247ca331434e6dd653cca4399dda5b1fd36 100644 --- a/hw/intc/loongarch_extioi.c +++ b/hw/intc/loongarch_extioi.c @@ -8,6 +8,7 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qemu/log.h" +#include "qapi/error.h" #include "hw/irq.h" #include 
"hw/sysbus.h" #include "hw/loongarch/virt.h" @@ -32,23 +33,23 @@ static void extioi_update_irq(LoongArchExtIOI *s, int irq, int level) if (((s->enable[irq_index]) & irq_mask) == 0) { return; } - s->coreisr[cpu][irq_index] |= irq_mask; - found = find_first_bit(s->sw_isr[cpu][ipnum], EXTIOI_IRQS); - set_bit(irq, s->sw_isr[cpu][ipnum]); + s->cpu[cpu].coreisr[irq_index] |= irq_mask; + found = find_first_bit(s->cpu[cpu].sw_isr[ipnum], EXTIOI_IRQS); + set_bit(irq, s->cpu[cpu].sw_isr[ipnum]); if (found < EXTIOI_IRQS) { /* other irq is handling, need not update parent irq level */ return; } } else { - s->coreisr[cpu][irq_index] &= ~irq_mask; - clear_bit(irq, s->sw_isr[cpu][ipnum]); - found = find_first_bit(s->sw_isr[cpu][ipnum], EXTIOI_IRQS); + s->cpu[cpu].coreisr[irq_index] &= ~irq_mask; + clear_bit(irq, s->cpu[cpu].sw_isr[ipnum]); + found = find_first_bit(s->cpu[cpu].sw_isr[ipnum], EXTIOI_IRQS); if (found < EXTIOI_IRQS) { /* other irq is handling, need not update parent irq level */ return; } } - qemu_set_irq(s->parent_irq[cpu][ipnum], level); + qemu_set_irq(s->cpu[cpu].parent_irq[ipnum], level); } static void extioi_setirq(void *opaque, int irq, int level) @@ -96,7 +97,7 @@ static MemTxResult extioi_readw(void *opaque, hwaddr addr, uint64_t *data, index = (offset - EXTIOI_COREISR_START) >> 2; /* using attrs to get current cpu index */ cpu = attrs.requester_id; - *data = s->coreisr[cpu][index]; + *data = s->cpu[cpu].coreisr[index]; break; case EXTIOI_COREMAP_START ... EXTIOI_COREMAP_END - 1: index = (offset - EXTIOI_COREMAP_START) >> 2; @@ -129,12 +130,68 @@ static inline void extioi_enable_irq(LoongArchExtIOI *s, int index,\ } } +static inline void extioi_update_sw_coremap(LoongArchExtIOI *s, int irq, + uint64_t val, bool notify) +{ + int i, cpu; + + /* + * loongarch only support little endian, + * so we paresd the value with little endian. + */ + val = cpu_to_le64(val); + + for (i = 0; i < 4; i++) { + cpu = val & 0xff; + if (!(s->status & BIT(EXTIOI_ENABLE_CPU_ENCODE))) { + cpu = ctz32(cpu); + cpu = (cpu >= 4) ? 0 : cpu; + } + val = val >> 8; + + if (s->sw_coremap[irq + i] == cpu) { + continue; + } + + if (notify && test_bit(irq + i, (unsigned long *)s->isr)) { + /* + * lower irq at old cpu and raise irq at new cpu + */ + extioi_update_irq(s, irq + i, 0); + s->sw_coremap[irq + i] = cpu; + extioi_update_irq(s, irq + i, 1); + } else { + s->sw_coremap[irq + i] = cpu; + } + } +} + +static inline void extioi_update_sw_ipmap(LoongArchExtIOI *s, int index, + uint64_t val) +{ + int i; + uint8_t ipnum; + + /* + * loongarch only support little endian, + * so we paresd the value with little endian. + */ + val = cpu_to_le64(val); + for (i = 0; i < 4; i++) { + ipnum = val & 0xff; + ipnum = ctz32(ipnum); + ipnum = (ipnum >= 4) ? 0 : ipnum; + s->sw_ipmap[index * 4 + i] = ipnum; + val = val >> 8; + } +} + static MemTxResult extioi_writew(void *opaque, hwaddr addr, uint64_t val, unsigned size, MemTxAttrs attrs) { LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); - int i, cpu, index, old_data, irq; + int cpu, index, old_data, irq; uint32_t offset; trace_loongarch_extioi_writew(addr, val); @@ -152,20 +209,7 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, */ index = (offset - EXTIOI_IPMAP_START) >> 2; s->ipmap[index] = val; - /* - * loongarch only support little endian, - * so we paresd the value with little endian. - */ - val = cpu_to_le64(val); - for (i = 0; i < 4; i++) { - uint8_t ipnum; - ipnum = val & 0xff; - ipnum = ctz32(ipnum); - ipnum = (ipnum >= 4) ? 
0 : ipnum; - s->sw_ipmap[index * 4 + i] = ipnum; - val = val >> 8; - } - + extioi_update_sw_ipmap(s, index, val); break; case EXTIOI_ENABLE_START ... EXTIOI_ENABLE_END - 1: index = (offset - EXTIOI_ENABLE_START) >> 2; @@ -189,8 +233,8 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, index = (offset - EXTIOI_COREISR_START) >> 2; /* using attrs to get current cpu index */ cpu = attrs.requester_id; - old_data = s->coreisr[cpu][index]; - s->coreisr[cpu][index] = old_data & ~val; + old_data = s->cpu[cpu].coreisr[index]; + s->cpu[cpu].coreisr[index] = old_data & ~val; /* write 1 to clear interrupt */ old_data &= val; irq = ctz32(old_data); @@ -204,33 +248,8 @@ static MemTxResult extioi_writew(void *opaque, hwaddr addr, irq = offset - EXTIOI_COREMAP_START; index = irq / 4; s->coremap[index] = val; - /* - * loongarch only support little endian, - * so we paresd the value with little endian. - */ - val = cpu_to_le64(val); - for (i = 0; i < 4; i++) { - cpu = val & 0xff; - cpu = ctz32(cpu); - cpu = (cpu >= 4) ? 0 : cpu; - val = val >> 8; - - if (s->sw_coremap[irq + i] == cpu) { - continue; - } - - if (test_bit(irq, (unsigned long *)s->isr)) { - /* - * lower irq at old cpu and raise irq at new cpu - */ - extioi_update_irq(s, irq + i, 0); - s->sw_coremap[irq + i] = cpu; - extioi_update_irq(s, irq + i, 1); - } else { - s->sw_coremap[irq + i] = cpu; - } - } + extioi_update_sw_coremap(s, irq, val, true); break; default: break; @@ -248,65 +267,192 @@ static const MemoryRegionOps extioi_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; -static const VMStateDescription vmstate_loongarch_extioi = { - .name = TYPE_LOONGARCH_EXTIOI, - .version_id = 1, - .minimum_version_id = 1, - .fields = (VMStateField[]) { - VMSTATE_UINT32_ARRAY(bounce, LoongArchExtIOI, EXTIOI_IRQS_GROUP_COUNT), - VMSTATE_UINT32_2DARRAY(coreisr, LoongArchExtIOI, EXTIOI_CPUS, - EXTIOI_IRQS_GROUP_COUNT), - VMSTATE_UINT32_ARRAY(nodetype, LoongArchExtIOI, - EXTIOI_IRQS_NODETYPE_COUNT / 2), - VMSTATE_UINT32_ARRAY(enable, LoongArchExtIOI, EXTIOI_IRQS / 32), - VMSTATE_UINT32_ARRAY(isr, LoongArchExtIOI, EXTIOI_IRQS / 32), - VMSTATE_UINT32_ARRAY(ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE / 4), - VMSTATE_UINT32_ARRAY(coremap, LoongArchExtIOI, EXTIOI_IRQS / 4), - VMSTATE_UINT8_ARRAY(sw_ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE), - VMSTATE_UINT8_ARRAY(sw_coremap, LoongArchExtIOI, EXTIOI_IRQS), +static MemTxResult extioi_virt_readw(void *opaque, hwaddr addr, uint64_t *data, + unsigned size, MemTxAttrs attrs) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); - VMSTATE_END_OF_LIST() + switch (addr) { + case EXTIOI_VIRT_FEATURES: + *data = s->features; + break; + case EXTIOI_VIRT_CONFIG: + *data = s->status; + break; + default: + break; + } + + return MEMTX_OK; +} + +static MemTxResult extioi_virt_writew(void *opaque, hwaddr addr, + uint64_t val, unsigned size, + MemTxAttrs attrs) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); + + switch (addr) { + case EXTIOI_VIRT_FEATURES: + return MEMTX_ACCESS_ERROR; + + case EXTIOI_VIRT_CONFIG: + /* + * extioi features can only be set while the extioi is disabled + */ + if ((s->status & BIT(EXTIOI_ENABLE)) && val) { + return MEMTX_ACCESS_ERROR; + } + + s->status = val & s->features; + break; + default: + break; } + return MEMTX_OK; +} + +static const MemoryRegionOps extioi_virt_ops = { + .read_with_attrs = extioi_virt_readw, + .write_with_attrs = extioi_virt_writew, + .impl.min_access_size = 4, + .impl.max_access_size = 4, + .valid.min_access_size = 4, + .valid.max_access_size = 8, + .endianness =
DEVICE_LITTLE_ENDIAN, }; -static void loongarch_extioi_instance_init(Object *obj) +static void loongarch_extioi_realize(DeviceState *dev, Error **errp) { - SysBusDevice *dev = SYS_BUS_DEVICE(obj); - LoongArchExtIOI *s = LOONGARCH_EXTIOI(obj); - int i, cpu, pin; + LoongArchExtIOI *s = LOONGARCH_EXTIOI(dev); + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + int i, pin; + + if (s->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } for (i = 0; i < EXTIOI_IRQS; i++) { - sysbus_init_irq(dev, &s->irq[i]); + sysbus_init_irq(sbd, &s->irq[i]); } - qdev_init_gpio_in(DEVICE(obj), extioi_setirq, EXTIOI_IRQS); + qdev_init_gpio_in(dev, extioi_setirq, EXTIOI_IRQS); + memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, + s, "extioi_system_mem", 0x900); + sysbus_init_mmio(sbd, &s->extioi_system_mem); + + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + memory_region_init_io(&s->virt_extend, OBJECT(s), &extioi_virt_ops, + s, "extioi_virt", EXTIOI_VIRT_SIZE); + sysbus_init_mmio(sbd, &s->virt_extend); + s->features |= EXTIOI_VIRT_HAS_FEATURES; + } else { + s->status |= BIT(EXTIOI_ENABLE); + } - for (cpu = 0; cpu < EXTIOI_CPUS; cpu++) { - memory_region_init_io(&s->extioi_iocsr_mem[cpu], OBJECT(s), &extioi_ops, - s, "extioi_iocsr", 0x900); - sysbus_init_mmio(dev, &s->extioi_iocsr_mem[cpu]); + s->cpu = g_new0(ExtIOICore, s->num_cpu); + if (s->cpu == NULL) { + error_setg(errp, "Memory allocation for ExtIOICore failed"); + return; + } + + for (i = 0; i < s->num_cpu; i++) { for (pin = 0; pin < LS3A_INTC_IP; pin++) { - qdev_init_gpio_out(DEVICE(obj), &s->parent_irq[cpu][pin], 1); + qdev_init_gpio_out(dev, &s->cpu[i].parent_irq[pin], 1); } } - memory_region_init_io(&s->extioi_system_mem, OBJECT(s), &extioi_ops, - s, "extioi_system_mem", 0x900); - sysbus_init_mmio(dev, &s->extioi_system_mem); } +static void loongarch_extioi_finalize(Object *obj) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(obj); + + g_free(s->cpu); +} + +static void loongarch_extioi_reset(DeviceState *d) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(d); + + /* use legacy interrupt routing method by default */ + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + s->status = 0; + } +} + +static int vmstate_extioi_post_load(void *opaque, int version_id) +{ + LoongArchExtIOI *s = LOONGARCH_EXTIOI(opaque); + int i, start_irq; + + for (i = 0; i < (EXTIOI_IRQS / 4); i++) { + start_irq = i * 4; + extioi_update_sw_coremap(s, start_irq, s->coremap[i], false); + } + + for (i = 0; i < (EXTIOI_IRQS_IPMAP_SIZE / 4); i++) { + extioi_update_sw_ipmap(s, i, s->ipmap[i]); + } + + return 0; +} + +static const VMStateDescription vmstate_extioi_core = { + .name = "extioi-core", + .version_id = 1, + .minimum_version_id = 1, + .fields = (const VMStateField[]) { + VMSTATE_UINT32_ARRAY(coreisr, ExtIOICore, EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_loongarch_extioi = { + .name = TYPE_LOONGARCH_EXTIOI, + .version_id = 2, + .minimum_version_id = 2, + .post_load = vmstate_extioi_post_load, + .fields = (const VMStateField[]) { + VMSTATE_UINT32_ARRAY(bounce, LoongArchExtIOI, EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_UINT32_ARRAY(nodetype, LoongArchExtIOI, + EXTIOI_IRQS_NODETYPE_COUNT / 2), + VMSTATE_UINT32_ARRAY(enable, LoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_ARRAY(isr, LoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_ARRAY(ipmap, LoongArchExtIOI, EXTIOI_IRQS_IPMAP_SIZE / 4), + VMSTATE_UINT32_ARRAY(coremap, LoongArchExtIOI, EXTIOI_IRQS / 4), + +
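/*
 * Editor's sketch: how the variable-length per-vcpu state above is
 * migrated. VMSTATE_STRUCT_VARRAY_POINTER_UINT32 (migration/vmstate.h)
 * walks a heap-allocated array whose element count is taken from
 * another field of the device. The Demo* names are illustrative, not
 * part of the patch.
 */
typedef struct DemoCore {
    uint32_t coreisr[EXTIOI_IRQS_GROUP_COUNT];
} DemoCore;

typedef struct DemoDev {
    uint32_t num_cpu;   /* set from the "num-cpu" qdev property */
    DemoCore *cpu;      /* g_new0(DemoCore, num_cpu) at realize time */
} DemoDev;

static const VMStateDescription vmstate_demo_core = {
    .name = "demo-core",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32_ARRAY(coreisr, DemoCore, EXTIOI_IRQS_GROUP_COUNT),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_demo_dev = {
    .name = "demo-dev",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (const VMStateField[]) {
        /* num_cpu must match on the source and the destination */
        VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, DemoDev, num_cpu,
                                             vmstate_demo_core, DemoCore),
        VMSTATE_END_OF_LIST()
    }
};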
VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchExtIOI, num_cpu, + vmstate_extioi_core, ExtIOICore), + VMSTATE_UINT32(features, LoongArchExtIOI), + VMSTATE_UINT32(status, LoongArchExtIOI), + VMSTATE_END_OF_LIST() + } +}; + +static Property extioi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", LoongArchExtIOI, num_cpu, 1), + DEFINE_PROP_BIT("has-virtualization-extension", LoongArchExtIOI, features, + EXTIOI_HAS_VIRT_EXTENSION, 0), + DEFINE_PROP_END_OF_LIST(), +}; + static void loongarch_extioi_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + dc->realize = loongarch_extioi_realize; + dc->reset = loongarch_extioi_reset; + device_class_set_props(dc, extioi_properties); dc->vmsd = &vmstate_loongarch_extioi; } static const TypeInfo loongarch_extioi_info = { .name = TYPE_LOONGARCH_EXTIOI, .parent = TYPE_SYS_BUS_DEVICE, - .instance_init = loongarch_extioi_instance_init, .instance_size = sizeof(struct LoongArchExtIOI), .class_init = loongarch_extioi_class_init, + .instance_finalize = loongarch_extioi_finalize, }; static void loongarch_extioi_register_types(void) diff --git a/hw/intc/loongarch_extioi_kvm.c b/hw/intc/loongarch_extioi_kvm.c new file mode 100644 index 0000000000000000000000000000000000000000..94af4378e466ff72602985c4dc7b92da76fdee70 --- /dev/null +++ b/hw/intc/loongarch_extioi_kvm.c @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm extioi interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "qemu/typedefs.h" +#include "hw/intc/loongarch_extioi.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" + +static void kvm_extioi_access_regs(int fd, uint64_t addr, + void *val, int is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static void kvm_extioi_access_sw_status(int fd, uint64_t addr, + void *val, bool is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS, + addr, val, is_write, &error_abort); +} + +static void kvm_extioi_save_load_sw_status(void *opaque, bool is_write) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + int addr; + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->num_cpu, is_write); + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->features, is_write); + + addr = KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE; + kvm_extioi_access_sw_status(fd, addr, (void *)&s->status, is_write); +} + +static int kvm_loongarch_extioi_pre_save(void *opaque) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_extioi_access_regs(fd, EXTIOI_NODETYPE_START, + (void *)s->nodetype, false); + kvm_extioi_access_regs(fd, EXTIOI_IPMAP_START, (void *)s->ipmap, false); + kvm_extioi_access_regs(fd, EXTIOI_ENABLE_START, (void *)s->enable, false); + kvm_extioi_access_regs(fd, EXTIOI_BOUNCE_START, (void *)s->bounce, false); + kvm_extioi_access_regs(fd, EXTIOI_ISR_START, (void *)s->isr, false); + kvm_extioi_access_regs(fd, EXTIOI_COREMAP_START, + (void *)s->coremap, false); + kvm_extioi_access_regs(fd, EXTIOI_SW_COREMAP_FLAG, + (void 
*)s->sw_coremap, false); + kvm_extioi_access_regs(fd, EXTIOI_COREISR_START, + (void *)s->coreisr, false); + + kvm_extioi_save_load_sw_status(opaque, false); + + return 0; +} + +static int kvm_loongarch_extioi_post_load(void *opaque, int version_id) +{ + KVMLoongArchExtIOI *s = (KVMLoongArchExtIOI *)opaque; + KVMLoongArchExtIOIClass *class = KVM_LOONGARCH_EXTIOI_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_extioi_access_regs(fd, EXTIOI_NODETYPE_START, + (void *)s->nodetype, true); + kvm_extioi_access_regs(fd, EXTIOI_IPMAP_START, (void *)s->ipmap, true); + kvm_extioi_access_regs(fd, EXTIOI_ENABLE_START, (void *)s->enable, true); + kvm_extioi_access_regs(fd, EXTIOI_BOUNCE_START, (void *)s->bounce, true); + kvm_extioi_access_regs(fd, EXTIOI_ISR_START, (void *)s->isr, true); + kvm_extioi_access_regs(fd, EXTIOI_COREMAP_START, (void *)s->coremap, true); + kvm_extioi_access_regs(fd, EXTIOI_SW_COREMAP_FLAG, + (void *)s->sw_coremap, true); + kvm_extioi_access_regs(fd, EXTIOI_COREISR_START, (void *)s->coreisr, true); + + kvm_extioi_save_load_sw_status(opaque, true); + + kvm_device_access(fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED, + NULL, true, &error_abort); + + return 0; +} + +static void kvm_loongarch_extioi_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchExtIOIClass *extioi_class = KVM_LOONGARCH_EXTIOI_GET_CLASS(dev); + KVMLoongArchExtIOI *s = KVM_LOONGARCH_EXTIOI(dev); + struct kvm_create_device cd = {0}; + Error *err = NULL; + int ret,i; + + extioi_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (s->features & BIT(EXTIOI_HAS_VIRT_EXTENSION)) { + s->features |= EXTIOI_VIRT_HAS_FEATURES; + } + + if (!extioi_class->is_created) { + cd.type = KVM_DEV_TYPE_LOONGARCH_EIOINTC; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, + "Creating the KVM extioi device failed"); + return; + } + extioi_class->is_created = true; + extioi_class->dev_fd = cd.fd; + + kvm_device_access(cd.fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU, + &s->num_cpu, true, NULL); + + kvm_device_access(cd.fd, KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL, + KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE, + &s->features, true, NULL); + + fprintf(stdout, "Create LoongArch extioi irqchip in KVM done!\n"); + } + + kvm_async_interrupts_allowed = true; + kvm_msi_via_irqfd_allowed = kvm_irqfds_enabled(); + if (kvm_has_gsi_routing()) { + for (i = 0; i < 64; ++i) { + kvm_irqchip_add_irq_route(kvm_state, i, 0, i); + } + kvm_gsi_routing_allowed = true; + } +} + +static const VMStateDescription vmstate_kvm_extioi_core = { + .name = "kvm-extioi-single", + .version_id = 2, + .minimum_version_id = 2, + .pre_save = kvm_loongarch_extioi_pre_save, + .post_load = kvm_loongarch_extioi_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT32_ARRAY(nodetype, KVMLoongArchExtIOI, + EXTIOI_IRQS_NODETYPE_COUNT / 2), + VMSTATE_UINT32_ARRAY(bounce, KVMLoongArchExtIOI, + EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_UINT32_ARRAY(isr, KVMLoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_2DARRAY(coreisr, KVMLoongArchExtIOI, EXTIOI_CPUS, + EXTIOI_IRQS_GROUP_COUNT), + VMSTATE_UINT32_ARRAY(enable, KVMLoongArchExtIOI, EXTIOI_IRQS / 32), + VMSTATE_UINT32_ARRAY(ipmap, KVMLoongArchExtIOI, + EXTIOI_IRQS_IPMAP_SIZE / 4), + VMSTATE_UINT32_ARRAY(coremap, KVMLoongArchExtIOI, EXTIOI_IRQS / 4), + VMSTATE_UINT8_ARRAY(sw_coremap, KVMLoongArchExtIOI, EXTIOI_IRQS), + VMSTATE_UINT32(num_cpu, KVMLoongArchExtIOI), 
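/*
 * Editor's sketch: what kvm_extioi_access_regs()/kvm_device_access()
 * reduce to at the kernel ABI level. KVM_{GET|SET}_DEVICE_ATTR take a
 * struct kvm_device_attr (linux/kvm.h) on the device fd returned by
 * KVM_CREATE_DEVICE. Illustrative helper; error handling is reduced
 * to the raw ioctl return value.
 */
static int demo_device_attr(int dev_fd, uint32_t group, uint64_t attr,
                            void *val, bool write)
{
    struct kvm_device_attr da = {
        .group = group,          /* e.g. KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS */
        .attr  = attr,           /* register offset within that group */
        .addr  = (uint64_t)(uintptr_t)val, /* userspace buffer */
    };

    return ioctl(dev_fd, write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
                 &da);
}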
+ VMSTATE_UINT32(features, KVMLoongArchExtIOI), + VMSTATE_UINT32(status, KVMLoongArchExtIOI), + VMSTATE_END_OF_LIST() + } +}; + +static Property extioi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", KVMLoongArchExtIOI, num_cpu, 1), + DEFINE_PROP_BIT("has-virtualization-extension", KVMLoongArchExtIOI, + features, EXTIOI_HAS_VIRT_EXTENSION, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void kvm_loongarch_extioi_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchExtIOIClass *extioi_class = KVM_LOONGARCH_EXTIOI_CLASS(oc); + + extioi_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_extioi_realize; + extioi_class->is_created = false; + device_class_set_props(dc, extioi_properties); + dc->vmsd = &vmstate_kvm_extioi_core; +} + +static const TypeInfo kvm_loongarch_extioi_info = { + .name = TYPE_KVM_LOONGARCH_EXTIOI, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchExtIOI), + .class_size = sizeof(KVMLoongArchExtIOIClass), + .class_init = kvm_loongarch_extioi_class_init, +}; + +static void kvm_loongarch_extioi_register_types(void) +{ + type_register_static(&kvm_loongarch_extioi_info); +} + +type_init(kvm_loongarch_extioi_register_types) diff --git a/hw/intc/loongarch_ipi.c b/hw/intc/loongarch_ipi.c index 67858b521c6a4ce68c1bc26b7ac6fba9e083db06..630bcb14ea7fc8b1b7c78838f5afd98e613db509 100644 --- a/hw/intc/loongarch_ipi.c +++ b/hw/intc/loongarch_ipi.c @@ -9,22 +9,26 @@ #include "hw/sysbus.h" #include "hw/intc/loongarch_ipi.h" #include "hw/irq.h" +#include "hw/qdev-properties.h" #include "qapi/error.h" #include "qemu/log.h" #include "exec/address-spaces.h" #include "hw/loongarch/virt.h" #include "migration/vmstate.h" +#include "target/loongarch/cpu.h" #include "target/loongarch/internals.h" #include "trace.h" -static void loongarch_ipi_writel(void *, hwaddr, uint64_t, unsigned); - -static uint64_t loongarch_ipi_readl(void *opaque, hwaddr addr, unsigned size) +static MemTxResult loongarch_ipi_readl(void *opaque, hwaddr addr, + uint64_t *data, + unsigned size, MemTxAttrs attrs) { - IPICore *s = opaque; + IPICore *s; + LoongArchIPI *ipi = opaque; uint64_t ret = 0; int index = 0; + s = &ipi->cpu[attrs.requester_id]; addr &= 0xff; switch (addr) { case CORE_STATUS_OFF: @@ -49,10 +53,12 @@ static uint64_t loongarch_ipi_readl(void *opaque, hwaddr addr, unsigned size) } trace_loongarch_ipi_read(size, (uint64_t)addr, ret); - return ret; + *data = ret; + return MEMTX_OK; } -static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) +static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr, + MemTxAttrs attrs) { int i, mask = 0, data = 0; @@ -61,8 +67,8 @@ static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) * if the mask is 0, we need not to do anything. 
*/ if ((val >> 27) & 0xf) { - data = address_space_ldl(&env->address_space_iocsr, addr, - MEMTXATTRS_UNSPECIFIED, NULL); + data = address_space_ldl(env->address_space_iocsr, addr, + attrs, NULL); for (i = 0; i < 4; i++) { /* get mask for byte writing */ if (val & (0x1 << (27 + i))) { @@ -73,8 +79,8 @@ static void send_ipi_data(CPULoongArchState *env, uint64_t val, hwaddr addr) data &= mask; data |= (val >> 32) & ~mask; - address_space_stl(&env->address_space_iocsr, addr, - data, MEMTXATTRS_UNSPECIFIED, NULL); + address_space_stl(env->address_space_iocsr, addr, + data, attrs, NULL); } static int archid_cmp(const void *a, const void *b) @@ -103,80 +109,72 @@ static CPUState *ipi_getcpu(int arch_id) CPUArchId *archid; archid = find_cpu_by_archid(machine, arch_id); - return CPU(archid->cpu); -} - -static void ipi_send(uint64_t val) -{ - uint32_t cpuid; - uint8_t vector; - CPUState *cs; - LoongArchCPU *cpu; - LoongArchIPI *s; - - cpuid = extract32(val, 16, 10); - if (cpuid >= LOONGARCH_MAX_CPUS) { - trace_loongarch_ipi_unsupported_cpuid("IOCSR_IPI_SEND", cpuid); - return; + if (archid) { + return CPU(archid->cpu); } - /* IPI status vector */ - vector = extract8(val, 0, 5); - - cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - s = LOONGARCH_IPI(cpu->env.ipistate); - loongarch_ipi_writel(&s->ipi_core, CORE_SET_OFF, BIT(vector), 4); + return NULL; } -static void mail_send(uint64_t val) +static MemTxResult mail_send(uint64_t val, MemTxAttrs attrs) { uint32_t cpuid; hwaddr addr; - CPULoongArchState *env; CPUState *cs; - LoongArchCPU *cpu; cpuid = extract32(val, 16, 10); if (cpuid >= LOONGARCH_MAX_CPUS) { trace_loongarch_ipi_unsupported_cpuid("IOCSR_MAIL_SEND", cpuid); - return; + return MEMTX_DECODE_ERROR; } - addr = 0x1020 + (val & 0x1c); cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - env = &cpu->env; - send_ipi_data(env, val, addr); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + addr = SMP_IPI_MAILBOX + CORE_BUF_20 + (val & 0x1c); + attrs.requester_id = cs->cpu_index; + send_ipi_data(&LOONGARCH_CPU(cs)->env, val, addr, attrs); + return MEMTX_OK; } -static void any_send(uint64_t val) +static MemTxResult any_send(uint64_t val, MemTxAttrs attrs) { uint32_t cpuid; hwaddr addr; - CPULoongArchState *env; CPUState *cs; - LoongArchCPU *cpu; cpuid = extract32(val, 16, 10); if (cpuid >= LOONGARCH_MAX_CPUS) { trace_loongarch_ipi_unsupported_cpuid("IOCSR_ANY_SEND", cpuid); - return; + return MEMTX_DECODE_ERROR; } - addr = val & 0xffff; cs = ipi_getcpu(cpuid); - cpu = LOONGARCH_CPU(cs); - env = &cpu->env; - send_ipi_data(env, val, addr); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + addr = val & 0xffff; + attrs.requester_id = cs->cpu_index; + send_ipi_data(&LOONGARCH_CPU(cs)->env, val, addr, attrs); + return MEMTX_OK; } -static void loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, - unsigned size) +static MemTxResult loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) { - IPICore *s = opaque; + LoongArchIPI *ipi = opaque; + IPICore *s; int index = 0; + uint32_t cpuid; + uint8_t vector; + CPUState *cs; + s = &ipi->cpu[attrs.requester_id]; addr &= 0xff; trace_loongarch_ipi_write(size, (uint64_t)addr, val); switch (addr) { @@ -203,17 +201,34 @@ static void loongarch_ipi_writel(void *opaque, hwaddr addr, uint64_t val, s->buf[index] = val; break; case IOCSR_IPI_SEND: - ipi_send(val); + cpuid = extract32(val, 16, 10); + if (cpuid >= LOONGARCH_MAX_CPUS) { + 
trace_loongarch_ipi_unsupported_cpuid("IOCSR_IPI_SEND", cpuid); + return MEMTX_DECODE_ERROR; + } + + /* IPI status vector */ + vector = extract8(val, 0, 5); + cs = ipi_getcpu(cpuid); + if (cs == NULL) { + return MEMTX_DECODE_ERROR; + } + + /* override requester_id */ + attrs.requester_id = cs->cpu_index; + loongarch_ipi_writel(ipi, CORE_SET_OFF, BIT(vector), 4, attrs); break; default: qemu_log_mask(LOG_UNIMP, "invalid write: %x", (uint32_t)addr); break; } + + return MEMTX_OK; } static const MemoryRegionOps loongarch_ipi_ops = { - .read = loongarch_ipi_readl, - .write = loongarch_ipi_writel, + .read_with_attrs = loongarch_ipi_readl, + .write_with_attrs = loongarch_ipi_writel, .impl.min_access_size = 4, .impl.max_access_size = 4, .valid.min_access_size = 4, @@ -222,24 +237,28 @@ static const MemoryRegionOps loongarch_ipi_ops = { }; /* mail send and any send only support writeq */ -static void loongarch_ipi_writeq(void *opaque, hwaddr addr, uint64_t val, - unsigned size) +static MemTxResult loongarch_ipi_writeq(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) { + MemTxResult ret = MEMTX_OK; + addr &= 0xfff; switch (addr) { case MAIL_SEND_OFFSET: - mail_send(val); + ret = mail_send(val, attrs); break; case ANY_SEND_OFFSET: - any_send(val); + ret = any_send(val, attrs); break; default: break; } + + return ret; } static const MemoryRegionOps loongarch_ipi64_ops = { - .write = loongarch_ipi_writeq, + .write_with_attrs = loongarch_ipi_writeq, .impl.min_access_size = 8, .impl.max_access_size = 8, .valid.min_access_size = 8, @@ -247,23 +266,39 @@ static const MemoryRegionOps loongarch_ipi64_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; -static void loongarch_ipi_init(Object *obj) +static void loongarch_ipi_realize(DeviceState *dev, Error **errp) { - LoongArchIPI *s = LOONGARCH_IPI(obj); - SysBusDevice *sbd = SYS_BUS_DEVICE(obj); + LoongArchIPI *s = LOONGARCH_IPI(dev); + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + int i; + + if (s->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } - memory_region_init_io(&s->ipi_iocsr_mem, obj, &loongarch_ipi_ops, - &s->ipi_core, "loongarch_ipi_iocsr", 0x48); + memory_region_init_io(&s->ipi_iocsr_mem, OBJECT(dev), &loongarch_ipi_ops, + s, "loongarch_ipi_iocsr", 0x48); /* loongarch_ipi_iocsr performs re-entrant IO through ipi_send */ s->ipi_iocsr_mem.disable_reentrancy_guard = true; sysbus_init_mmio(sbd, &s->ipi_iocsr_mem); - memory_region_init_io(&s->ipi64_iocsr_mem, obj, &loongarch_ipi64_ops, - &s->ipi_core, "loongarch_ipi64_iocsr", 0x118); + memory_region_init_io(&s->ipi64_iocsr_mem, OBJECT(dev), + &loongarch_ipi64_ops, + s, "loongarch_ipi64_iocsr", 0x118); sysbus_init_mmio(sbd, &s->ipi64_iocsr_mem); - qdev_init_gpio_out(DEVICE(obj), &s->ipi_core.irq, 1); + + s->cpu = g_new0(IPICore, s->num_cpu); + if (s->cpu == NULL) { + error_setg(errp, "Memory allocation for IPICore failed"); + return; + } + + for (i = 0; i < s->num_cpu; i++) { + qdev_init_gpio_out(dev, &s->cpu[i].irq, 1); + } } static const VMStateDescription vmstate_ipi_core = { @@ -282,27 +317,42 @@ static const VMStateDescription vmstate_ipi_core = { static const VMStateDescription vmstate_loongarch_ipi = { .name = TYPE_LOONGARCH_IPI, - .version_id = 1, - .minimum_version_id = 1, - .fields = (VMStateField[]) { - VMSTATE_STRUCT(ipi_core, LoongArchIPI, 0, vmstate_ipi_core, IPICore), + .version_id = 2, + .minimum_version_id = 2, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, LoongArchIPI, num_cpu, + vmstate_ipi_core,
IPICore), VMSTATE_END_OF_LIST() } }; +static Property ipi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", LoongArchIPI, num_cpu, 1), + DEFINE_PROP_END_OF_LIST(), +}; + static void loongarch_ipi_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + dc->realize = loongarch_ipi_realize; + device_class_set_props(dc, ipi_properties); dc->vmsd = &vmstate_loongarch_ipi; } +static void loongarch_ipi_finalize(Object *obj) +{ + LoongArchIPI *s = LOONGARCH_IPI(obj); + + g_free(s->cpu); +} + static const TypeInfo loongarch_ipi_info = { .name = TYPE_LOONGARCH_IPI, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(LoongArchIPI), - .instance_init = loongarch_ipi_init, .class_init = loongarch_ipi_class_init, + .instance_finalize = loongarch_ipi_finalize, }; static void loongarch_ipi_register_types(void) diff --git a/hw/intc/loongarch_ipi_kvm.c b/hw/intc/loongarch_ipi_kvm.c new file mode 100644 index 0000000000000000000000000000000000000000..57fc05db778ddbfbcb02845801351525cadcddaf --- /dev/null +++ b/hw/intc/loongarch_ipi_kvm.c @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm ipi interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "qemu/typedefs.h" +#include "hw/intc/loongarch_ipi.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" + +#define IPI_DEV_FD_UNDEF -1 + +static void kvm_ipi_access_regs(int fd, uint64_t addr, + uint32_t *val, int is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_IPI_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static int kvm_loongarch_ipi_pre_save(void *opaque) +{ + KVMLoongArchIPI *ipi = (KVMLoongArchIPI *)opaque; + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(ipi); + IPICore *cpu; + uint64_t attr; + int cpu_id = 0; + int fd = ipi_class->dev_fd; + + for (cpu_id = 0; cpu_id < ipi->num_cpu; cpu_id++) { + cpu = &ipi->cpu[cpu_id]; + attr = (cpu_id << 16) | CORE_STATUS_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->status, false); + + attr = (cpu_id << 16) | CORE_EN_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->en, false); + + attr = (cpu_id << 16) | CORE_SET_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->set, false); + + attr = (cpu_id << 16) | CORE_CLEAR_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->clear, false); + + attr = (cpu_id << 16) | CORE_BUF_20; + kvm_ipi_access_regs(fd, attr, &cpu->buf[0], false); + + attr = (cpu_id << 16) | CORE_BUF_28; + kvm_ipi_access_regs(fd, attr, &cpu->buf[2], false); + + attr = (cpu_id << 16) | CORE_BUF_30; + kvm_ipi_access_regs(fd, attr, &cpu->buf[4], false); + + attr = (cpu_id << 16) | CORE_BUF_38; + kvm_ipi_access_regs(fd, attr, &cpu->buf[6], false); + } + + return 0; +} + +static int kvm_loongarch_ipi_post_load(void *opaque, int version_id) +{ + KVMLoongArchIPI *ipi = (KVMLoongArchIPI *)opaque; + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(ipi); + IPICore *cpu; + uint64_t attr; + int cpu_id = 0; + int fd = ipi_class->dev_fd; + + for (cpu_id = 0; cpu_id < ipi->num_cpu; cpu_id++) { + cpu = &ipi->cpu[cpu_id]; + attr = (cpu_id << 16) | CORE_STATUS_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->status, true); + + attr = (cpu_id << 16) | CORE_EN_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->en, true); + + attr = (cpu_id << 16) | CORE_SET_OFF; + kvm_ipi_access_regs(fd, attr, &cpu->set, true); + + attr = (cpu_id << 16) | CORE_CLEAR_OFF; + kvm_ipi_access_regs(fd, 
attr, &cpu->clear, true); + + attr = (cpu_id << 16) | CORE_BUF_20; + kvm_ipi_access_regs(fd, attr, &cpu->buf[0], true); + + attr = (cpu_id << 16) | CORE_BUF_28; + kvm_ipi_access_regs(fd, attr, &cpu->buf[2], true); + + attr = (cpu_id << 16) | CORE_BUF_30; + kvm_ipi_access_regs(fd, attr, &cpu->buf[4], true); + + attr = (cpu_id << 16) | CORE_BUF_38; + kvm_ipi_access_regs(fd, attr, &cpu->buf[6], true); + } + + return 0; +} + +static void kvm_loongarch_ipi_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchIPI *ipi = KVM_LOONGARCH_IPI(dev); + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_GET_CLASS(dev); + struct kvm_create_device cd = {0}; + Error *err = NULL; + int ret; + + if (ipi->num_cpu == 0) { + error_setg(errp, "num-cpu must be at least 1"); + return; + } + + ipi_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + ipi->cpu = g_new0(IPICore, ipi->num_cpu); + if (ipi->cpu == NULL) { + error_setg(errp, "Memory allocation for IPICore failed"); + return; + } + + if (!ipi_class->is_created) { + cd.type = KVM_DEV_TYPE_LOONGARCH_IPI; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, "Creating the KVM device failed"); + return; + } + ipi_class->is_created = true; + ipi_class->dev_fd = cd.fd; + fprintf(stdout, "Create LoongArch IPI irqchip in KVM done!\n"); + } + + assert(ipi_class->dev_fd != IPI_DEV_FD_UNDEF); +} + +static Property kvm_loongarch_ipi_properties[] = { + DEFINE_PROP_UINT32("num-cpu", KVMLoongArchIPI, num_cpu, 1), + DEFINE_PROP_END_OF_LIST() +}; + +static const VMStateDescription vmstate_kvm_ipi_core = { + .name = "kvm-ipi-single", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(status, IPICore), + VMSTATE_UINT32(en, IPICore), + VMSTATE_UINT32(set, IPICore), + VMSTATE_UINT32(clear, IPICore), + VMSTATE_UINT32_ARRAY(buf, IPICore, 8), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_kvm_loongarch_ipi = { + .name = TYPE_KVM_LOONGARCH_IPI, + .version_id = 1, + .minimum_version_id = 1, + .pre_save = kvm_loongarch_ipi_pre_save, + .post_load = kvm_loongarch_ipi_post_load, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, KVMLoongArchIPI, num_cpu, + vmstate_kvm_ipi_core, IPICore), + + VMSTATE_END_OF_LIST() + } +}; + +static void kvm_loongarch_ipi_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchIPIClass *ipi_class = KVM_LOONGARCH_IPI_CLASS(oc); + + ipi_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_ipi_realize; + + ipi_class->is_created = false; + ipi_class->dev_fd = IPI_DEV_FD_UNDEF; + + device_class_set_props(dc, kvm_loongarch_ipi_properties); + + dc->vmsd = &vmstate_kvm_loongarch_ipi; +} + +static const TypeInfo kvm_loongarch_ipi_info = { + .name = TYPE_KVM_LOONGARCH_IPI, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchIPI), + .class_size = sizeof(KVMLoongArchIPIClass), + .class_init = kvm_loongarch_ipi_class_init, +}; + +static void kvm_loongarch_ipi_register_types(void) +{ + type_register_static(&kvm_loongarch_ipi_info); +} + +type_init(kvm_loongarch_ipi_register_types) diff --git a/hw/intc/loongarch_pch_msi.c b/hw/intc/loongarch_pch_msi.c index ecf3ed0267e4fb79e0614fcdc941b7e121e2e644..901c2c21bef33d6d46ff5347e755d9d716cb3d8b 100644 --- a/hw/intc/loongarch_pch_msi.c +++ b/hw/intc/loongarch_pch_msi.c @@ -14,6 +14,8 @@ #include "hw/misc/unimp.h" #include "migration/vmstate.h" #include "trace.h"
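/*
 * Editor's sketch: the attribute encoding used by the IPI
 * pre_save/post_load loops above. The vcpu index lives in the upper
 * half of the attribute and the register offset in the lower half;
 * the helper name is illustrative, the 16-bit shift matches the
 * (cpu_id << 16) in the loops.
 */
static inline uint64_t demo_ipi_reg_attr(int cpu_id, uint32_t reg_off)
{
    return ((uint64_t)cpu_id << 16) | reg_off;  /* e.g. CORE_STATUS_OFF */
}

/* One iteration of the save loop, rewritten with the helper:
 *   kvm_ipi_access_regs(fd, demo_ipi_reg_attr(cpu_id, CORE_EN_OFF),
 *                       &cpu->en, false);
 */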
+#include "sysemu/kvm.h" +#include "hw/loongarch/virt.h" static uint64_t loongarch_msi_mem_read(void *opaque, hwaddr addr, unsigned size) { @@ -26,14 +28,24 @@ static void loongarch_msi_mem_write(void *opaque, hwaddr addr, LoongArchPCHMSI *s = (LoongArchPCHMSI *)opaque; int irq_num; - /* - * vector number is irq number from upper extioi intc - * need subtract irq base to get msi vector offset - */ - irq_num = (val & 0xff) - s->irq_base; - trace_loongarch_msi_set_irq(irq_num); - assert(irq_num < s->irq_num); - qemu_set_irq(s->pch_msi_irq[irq_num], 1); + MSIMessage msg = { + .address = addr, + .data = val, + }; + + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irqchip_send_msi(kvm_state, msg); + } else { + /* + * the vector number is the irq number from the upstream extioi intc; + * subtract the irq base to get the msi vector offset + */ + irq_num = (val & 0xff) - s->irq_base; + trace_loongarch_msi_set_irq(irq_num); + assert(irq_num < s->irq_num); + + qemu_set_irq(s->pch_msi_irq[irq_num], 1); + } } static const MemoryRegionOps loongarch_pch_msi_ops = { @@ -46,7 +58,16 @@ static void pch_msi_irq_handler(void *opaque, int irq, int level) { LoongArchPCHMSI *s = LOONGARCH_PCH_MSI(opaque); - qemu_set_irq(s->pch_msi_irq[irq], level); + MSIMessage msg = { + .address = 0, + .data = irq, + }; + + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irqchip_send_msi(kvm_state, msg); + } else { + qemu_set_irq(s->pch_msi_irq[irq], level); + } } static void loongarch_pch_msi_realize(DeviceState *dev, Error **errp) diff --git a/hw/intc/loongarch_pch_pic.c b/hw/intc/loongarch_pch_pic.c index 6aa4cadfa4afd936d61a96e600b31947a62bb9b9..beb4ac188d8f97912856194bdfa00dfc90b00214 100644 --- a/hw/intc/loongarch_pch_pic.c +++ b/hw/intc/loongarch_pch_pic.c @@ -16,19 +16,28 @@ #include "migration/vmstate.h" #include "trace.h" #include "qapi/error.h" +#include "sysemu/kvm.h" static void pch_pic_update_irq(LoongArchPCHPIC *s, uint64_t mask, int level) { uint64_t val; int irq; + int kvm_irq; if (level) { val = mask & s->intirr & ~s->int_mask; if (val) { irq = ctz64(val); s->intisr |= MAKE_64BIT_MASK(irq, 1); - qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 1); - } + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irq = ( + KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | s->htmsi_vector[irq]; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } else { + qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 1); + } + } } else { /* * intirr means requested pending irq @@ -38,8 +47,15 @@ static void pch_pic_update_irq(LoongArchPCHPIC *s, uint64_t mask, int level) if (val) { irq = ctz64(val); s->intisr &= ~MAKE_64BIT_MASK(irq, 1); - qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 0); - } + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_irq = ( + KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | s->htmsi_vector[irq]; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } else { + qemu_set_irq(s->parent_irq[s->htmsi_vector[irq]], 0); + } + } } } diff --git a/hw/intc/loongarch_pch_pic_kvm.c b/hw/intc/loongarch_pch_pic_kvm.c new file mode 100644 index 0000000000000000000000000000000000000000..e9cef02f9aacb9e9dbed5b173587e573fd012bcb --- /dev/null +++ b/hw/intc/loongarch_pch_pic_kvm.c @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch kvm pch pic interrupt support + * + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h"
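/*
 * Editor's sketch of the MSI dispatch the pch_msi write path above
 * implements: with an in-kernel irqchip the message goes straight to
 * KVM, otherwise it is decoded back into a wired irq towards the
 * userspace extioi model. Assumes the patch's LoongArchPCHMSI fields;
 * the helper name is illustrative.
 */
static void demo_deliver_msi(LoongArchPCHMSI *s, hwaddr addr, uint64_t val)
{
    MSIMessage msg = { .address = addr, .data = val };

    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
        /* In-kernel irqchip: inject the MSI directly through KVM. */
        kvm_irqchip_send_msi(kvm_state, msg);
    } else {
        /* Userspace model: vector number minus irq base selects the pin. */
        qemu_set_irq(s->pch_msi_irq[(val & 0xff) - s->irq_base], 1);
    }
}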
+#include "qemu/typedefs.h" +#include "hw/intc/loongarch_pch_pic.h" +#include "hw/sysbus.h" +#include "linux/kvm.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" +#include "hw/loongarch/virt.h" +#include "hw/pci-host/ls7a.h" +#include "qemu/error-report.h" + +static void kvm_pch_pic_access_regs(int fd, uint64_t addr, + void *val, int is_write) +{ + kvm_device_access(fd, KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS, + addr, val, is_write, &error_abort); +} + +static int kvm_loongarch_pch_pic_pre_save(void *opaque) +{ + KVMLoongArchPCHPIC *s = (KVMLoongArchPCHPIC *)opaque; + KVMLoongArchPCHPICClass *class = KVM_LOONGARCH_PCH_PIC_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_pch_pic_access_regs(fd, PCH_PIC_MASK_START, + (void *)&s->int_mask, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_EN_START, + (void *)&s->htmsi_en, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_EDGE_START, + (void *)&s->intedge, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL0_START, + (void *)&s->auto_crtl0, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL1_START, + (void *)&s->auto_crtl1, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_ROUTE_ENTRY_START, + (void *)s->route_entry, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_VEC_START, + (void *)s->htmsi_vector, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_IRR_START, + (void *)&s->intirr, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_ISR_START, + (void *)&s->intisr, false); + kvm_pch_pic_access_regs(fd, PCH_PIC_POLARITY_START, + (void *)&s->int_polarity, false); + + return 0; +} + +static int kvm_loongarch_pch_pic_post_load(void *opaque, int version_id) +{ + KVMLoongArchPCHPIC *s = (KVMLoongArchPCHPIC *)opaque; + KVMLoongArchPCHPICClass *class = KVM_LOONGARCH_PCH_PIC_GET_CLASS(s); + int fd = class->dev_fd; + + kvm_pch_pic_access_regs(fd, PCH_PIC_MASK_START, + (void *)&s->int_mask, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_EN_START, + (void *)&s->htmsi_en, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_EDGE_START, + (void *)&s->intedge, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL0_START, + (void *)&s->auto_crtl0, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_AUTO_CTRL1_START, + (void *)&s->auto_crtl1, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_ROUTE_ENTRY_START, + (void *)s->route_entry, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_HTMSI_VEC_START, + (void *)s->htmsi_vector, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_IRR_START, + (void *)&s->intirr, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_INT_ISR_START, + (void *)&s->intisr, true); + kvm_pch_pic_access_regs(fd, PCH_PIC_POLARITY_START, + (void *)&s->int_polarity, true); + + return 0; +} + +static void kvm_pch_pic_handler(void *opaque, int irq, int level) +{ + int kvm_irq; + + if (kvm_enabled()) { + kvm_irq = \ + (KVM_LOONGARCH_IRQ_TYPE_IOAPIC << KVM_LOONGARCH_IRQ_TYPE_SHIFT) + | (0 << KVM_LOONGARCH_IRQ_VCPU_SHIFT) | irq; + kvm_set_irq(kvm_state, kvm_irq, !!level); + } +} + +static void kvm_loongarch_pch_pic_realize(DeviceState *dev, Error **errp) +{ + KVMLoongArchPCHPICClass *pch_pic_class = + KVM_LOONGARCH_PCH_PIC_GET_CLASS(dev); + struct kvm_create_device cd = {0}; + uint64_t pch_pic_base = VIRT_PCH_REG_BASE; + Error *err = NULL; + int ret; + + pch_pic_class->parent_realize(dev, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (!pch_pic_class->is_created) { + cd.type = KVM_DEV_TYPE_LOONGARCH_PCHPIC; + ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd); + if (ret < 0) { + error_setg_errno(errp, errno, + 
"Creating the KVM pch pic device failed"); + return; + } + pch_pic_class->is_created = true; + pch_pic_class->dev_fd = cd.fd; + fprintf(stdout, "Create LoongArch pch pic irqchip in KVM done!\n"); + + ret = kvm_device_access(cd.fd, KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL, + KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT, + &pch_pic_base, true, NULL); + if (ret < 0) { + error_report( + "KVM EXTIOI: failed to set the base address of EXTIOI"); + exit(1); + } + + qdev_init_gpio_in(dev, kvm_pch_pic_handler, VIRT_PCH_PIC_IRQ_NUM); + } +} + +static const VMStateDescription vmstate_kvm_loongarch_pch_pic = { + .name = TYPE_LOONGARCH_PCH_PIC, + .version_id = 1, + .minimum_version_id = 1, + .pre_save = kvm_loongarch_pch_pic_pre_save, + .post_load = kvm_loongarch_pch_pic_post_load, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(int_mask, KVMLoongArchPCHPIC), + VMSTATE_UINT64(htmsi_en, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intedge, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intclr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(auto_crtl0, KVMLoongArchPCHPIC), + VMSTATE_UINT64(auto_crtl1, KVMLoongArchPCHPIC), + VMSTATE_UINT8_ARRAY(route_entry, KVMLoongArchPCHPIC, 64), + VMSTATE_UINT8_ARRAY(htmsi_vector, KVMLoongArchPCHPIC, 64), + VMSTATE_UINT64(last_intirr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intirr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(intisr, KVMLoongArchPCHPIC), + VMSTATE_UINT64(int_polarity, KVMLoongArchPCHPIC), + VMSTATE_END_OF_LIST() + } +}; + + +static void kvm_loongarch_pch_pic_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + KVMLoongArchPCHPICClass *pch_pic_class = KVM_LOONGARCH_PCH_PIC_CLASS(oc); + + pch_pic_class->parent_realize = dc->realize; + dc->realize = kvm_loongarch_pch_pic_realize; + pch_pic_class->is_created = false; + dc->vmsd = &vmstate_kvm_loongarch_pch_pic; + +} + +static const TypeInfo kvm_loongarch_pch_pic_info = { + .name = TYPE_KVM_LOONGARCH_PCH_PIC, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(KVMLoongArchPCHPIC), + .class_size = sizeof(KVMLoongArchPCHPICClass), + .class_init = kvm_loongarch_pch_pic_class_init, +}; + +static void kvm_loongarch_pch_pic_register_types(void) +{ + type_register_static(&kvm_loongarch_pch_pic_info); +} + +type_init(kvm_loongarch_pch_pic_register_types) diff --git a/hw/intc/meson.build b/hw/intc/meson.build index ed355941d174193633f1601d45208eddf8f4963d..49b450131556911967c3b7881041f48c1aca95b9 100644 --- a/hw/intc/meson.build +++ b/hw/intc/meson.build @@ -70,6 +70,9 @@ specific_ss.add(when: ['CONFIG_KVM', 'CONFIG_XIVE'], specific_ss.add(when: 'CONFIG_M68K_IRQC', if_true: files('m68k_irqc.c')) specific_ss.add(when: 'CONFIG_NIOS2_VIC', if_true: files('nios2_vic.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_IPI', if_true: files('loongarch_ipi.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_IPI_KVM', if_true: files('loongarch_ipi_kvm.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC', if_true: files('loongarch_pch_pic.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_MSI', if_true: files('loongarch_pch_msi.c')) specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI', if_true: files('loongarch_extioi.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_EXTIOI_KVM', if_true: files('loongarch_extioi_kvm.c')) +specific_ss.add(when: 'CONFIG_LOONGARCH_PCH_PIC_KVM', if_true: files('loongarch_pch_pic_kvm.c')) diff --git a/hw/intc/openpic.c b/hw/intc/openpic.c index a6f91d4bcdf7d154ef16507a6da8b1d18dd6504a..d74ec11af4adb3fb1f5855e9cc85465477a3223c 100644 --- a/hw/intc/openpic.c +++ b/hw/intc/openpic.c @@ -41,7 +41,6 @@ #include "hw/pci/msi.h" 
#include "qapi/error.h" #include "qemu/bitops.h" -#include "qapi/qmp/qerror.h" #include "qemu/module.h" #include "qemu/timer.h" #include "qemu/error-report.h" @@ -1032,13 +1031,14 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, s_IRQ = IRQ_get_next(opp, &dst->servicing); /* Check queued interrupts. */ n_IRQ = IRQ_get_next(opp, &dst->raised); - src = &opp->src[n_IRQ]; - if (n_IRQ != -1 && - (s_IRQ == -1 || - IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { - DPRINTF("Raise OpenPIC INT output cpu %d irq %d", - idx, n_IRQ); - qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); + if (n_IRQ != -1) { + src = &opp->src[n_IRQ]; + if (s_IRQ == -1 || + IVPR_PRIORITY(src->ivpr) > dst->servicing.priority) { + DPRINTF("Raise OpenPIC INT output cpu %d irq %d", + idx, n_IRQ); + qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); + } } break; default: @@ -1535,9 +1535,7 @@ static void openpic_realize(DeviceState *dev, Error **errp) }; if (opp->nb_cpus > MAX_CPU) { - error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE, - TYPE_OPENPIC, "nb_cpus", (uint64_t)opp->nb_cpus, - (uint64_t)0, (uint64_t)MAX_CPU); + error_setg(errp, "property 'nb_cpus' can be at most %d", MAX_CPU); return; } diff --git a/hw/intc/riscv_aplic.c b/hw/intc/riscv_aplic.c index c677b5cfbb54d3fdd2480063ba0f5edc0c67d379..2fdf85444e556c9ec30f9726cc5afe1394019c78 100644 --- a/hw/intc/riscv_aplic.c +++ b/hw/intc/riscv_aplic.c @@ -974,16 +974,16 @@ DeviceState *riscv_aplic_create(hwaddr addr, hwaddr size, qdev_prop_set_bit(dev, "msimode", msimode); qdev_prop_set_bit(dev, "mmode", mmode); + if (parent) { + riscv_aplic_add_child(parent, dev); + } + sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); if (!is_kvm_aia(msimode)) { sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr); } - if (parent) { - riscv_aplic_add_child(parent, dev); - } - if (!msimode) { for (i = 0; i < num_harts; i++) { CPUState *cpu = cpu_by_arch_id(hartid_base + i); diff --git a/hw/intc/sifive_plic.c b/hw/intc/sifive_plic.c index 5522ede2cf856f78be9107d2eee7e993960a76eb..e5de52bc44c99df0baa7b5cd7e6973c2f4832133 100644 --- a/hw/intc/sifive_plic.c +++ b/hw/intc/sifive_plic.c @@ -349,8 +349,10 @@ static void sifive_plic_irq_request(void *opaque, int irq, int level) { SiFivePLICState *s = opaque; - sifive_plic_set_pending(s, irq, level > 0); - sifive_plic_update(s); + if (level > 0) { + sifive_plic_set_pending(s, irq, true); + sifive_plic_update(s); + } } static void sifive_plic_realize(DeviceState *dev, Error **errp) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 5789062379c7a616f4f907136d5d0bea3a653e48..7a86197fc95a01841cab5968ba327475e4c12a24 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -720,7 +720,7 @@ int kvmppc_xive_connect(SpaprInterruptController *intc, uint32_t nr_servers, { SpaprXive *xive = SPAPR_XIVE(intc); XiveSource *xsrc = &xive->source; - size_t esb_len = xive_source_esb_len(xsrc); + uint64_t esb_len = xive_source_esb_len(xsrc); size_t tima_len = 4ull << TM_SHIFT; CPUState *cs; int fd; @@ -824,7 +824,7 @@ void kvmppc_xive_disconnect(SpaprInterruptController *intc) { SpaprXive *xive = SPAPR_XIVE(intc); XiveSource *xsrc; - size_t esb_len; + uint64_t esb_len; assert(xive->fd != -1); diff --git a/hw/intc/xive.c b/hw/intc/xive.c index a3585593d8f495f9699eebfb050f83f67c1bd389..0cfc172dd4c7f3ba7f24982c1f337229f48fb3d8 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -1238,7 +1238,7 @@ static void xive_source_reset(void *dev) static void xive_source_realize(DeviceState *dev, Error 
**errp) { XiveSource *xsrc = XIVE_SOURCE(dev); - size_t esb_len = xive_source_esb_len(xsrc); + uint64_t esb_len = xive_source_esb_len(xsrc); assert(xsrc->xive); diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c index 9c2333a277d9ea253b50e1fc09b00c481913b03e..0334431219c88316fab11b983a921d5575863488 100644 --- a/hw/isa/vt82c686.c +++ b/hw/isa/vt82c686.c @@ -613,7 +613,7 @@ void via_isa_set_irq(PCIDevice *d, int pin, int level) ViaISAState *s = VIA_ISA(pci_get_function_0(d)); uint8_t irq = d->config[PCI_INTERRUPT_LINE], max_irq = 15; int f = PCI_FUNC(d->devfn); - uint16_t mask = BIT(f); + uint16_t mask; switch (f) { case 0: /* PIRQ/PINT inputs */ @@ -628,6 +628,7 @@ void via_isa_set_irq(PCIDevice *d, int pin, int level) } /* Keep track of the state of all sources */ + mask = BIT(f); if (level) { s->irq_state[0] |= mask; } else { diff --git a/hw/loongarch/Kconfig b/hw/loongarch/Kconfig index 5727efed6d8442bee0217d2dcd0f2e6c1cd8c52e..16c854c0d595f54aa4a9bcf194f19cbbbccd9331 100644 --- a/hw/loongarch/Kconfig +++ b/hw/loongarch/Kconfig @@ -1,10 +1,12 @@ config LOONGARCH_VIRT bool + default y + depends on LOONGARCH64 select PCI select PCI_EXPRESS_GENERIC_BRIDGE - imply VIRTIO_VGA imply PCI_DEVICES imply NVDIMM + imply TPM_TIS_SYSBUS select SERIAL select VIRTIO_PCI select PLATFORM_BUS @@ -12,8 +14,12 @@ config LOONGARCH_VIRT select LOONGARCH_PCH_PIC select LOONGARCH_PCH_MSI select LOONGARCH_EXTIOI + select LOONGARCH_IPI_KVM if KVM + select LOONGARCH_PCH_PIC_KVM if KVM + select LOONGARCH_EXTIOI_KVM if KVM select LS7A_RTC select SMBIOS + select ACPI_CPU_HOTPLUG select ACPI_PCI select ACPI_HW_REDUCED select FW_CFG_DMA diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c index ae292fc5432670aed9b823f421428c859d9929f3..a54c5e0e70622e0e1212505df2cb6b47deef82d9 100644 --- a/hw/loongarch/acpi-build.c +++ b/hw/loongarch/acpi-build.c @@ -31,6 +31,7 @@ #include "hw/acpi/generic_event_device.h" #include "hw/pci-host/gpex.h" +#include "sysemu/sysemu.h" #include "sysemu/tpm.h" #include "hw/platform-bus.h" #include "hw/acpi/aml-build.h" @@ -46,6 +47,22 @@ #define ACPI_BUILD_DPRINTF(fmt, ...) #endif +static void virt_madt_cpu_entry(int uid, + const CPUArchIdList *apic_ids, + GArray *entry, bool force_enabled) +{ + uint32_t flags, apic_id = apic_ids->cpus[uid].arch_id; + + flags = apic_ids->cpus[uid].cpu || force_enabled ? 
1 /* Enabled */ : 0; + + /* Rev 1.0b, Table 5-13 Processor Local APIC Structure */ + build_append_int_noprefix(entry, 0, 1); /* Type */ + build_append_int_noprefix(entry, 8, 1); /* Length */ + build_append_int_noprefix(entry, uid, 1); /* ACPI Processor ID */ + build_append_int_noprefix(entry, apic_id, 1); /* APIC ID */ + build_append_int_noprefix(entry, flags, 4); /* Flags */ +} + /* build FADT */ static void init_common_fadt_data(AcpiFadtData *data) { @@ -105,14 +122,15 @@ build_facs(GArray *table_data) /* build MADT */ static void -build_madt(GArray *table_data, BIOSLinker *linker, LoongArchMachineState *lams) +build_madt(GArray *table_data, BIOSLinker *linker, + LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); MachineClass *mc = MACHINE_GET_CLASS(ms); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(ms); int i, arch_id; - AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + AcpiTable table = { .sig = "APIC", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); @@ -121,15 +139,17 @@ build_madt(GArray *table_data, BIOSLinker *linker, LoongArchMachineState *lams) build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */ for (i = 0; i < arch_ids->len; i++) { + uint32_t flags; + /* Processor Core Interrupt Controller Structure */ arch_id = arch_ids->cpus[i].arch_id; - + flags = arch_ids->cpus[i].cpu ? 1 : 0; build_append_int_noprefix(table_data, 17, 1); /* Type */ build_append_int_noprefix(table_data, 15, 1); /* Length */ build_append_int_noprefix(table_data, 1, 1); /* Version */ build_append_int_noprefix(table_data, i, 4); /* ACPI Processor ID */ build_append_int_noprefix(table_data, arch_id, 4); /* Core ID */ - build_append_int_noprefix(table_data, 1, 4); /* Flags */ + build_append_int_noprefix(table_data, flags, 4); /* Flags */ } /* Extend I/O Interrupt Controller Structure */ @@ -165,13 +185,14 @@ static void build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) { int i, arch_id, node_id; - uint64_t mem_len, mem_base; - int nb_numa_nodes = machine->numa_state->num_nodes; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); - MachineClass *mc = MACHINE_GET_CLASS(lams); + hwaddr len, base, gap; + NodeInfo *numa_info; + int nodes, nb_numa_nodes = machine->numa_state->num_nodes; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + MachineClass *mc = MACHINE_GET_CLASS(lvms); const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(machine); - AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + AcpiTable table = { .sig = "SRAT", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); build_append_int_noprefix(table_data, 1, 4); /* Reserved */ @@ -195,41 +216,88 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_append_int_noprefix(table_data, 0, 4); /* Reserved */ } - /* Node0 */ - build_srat_memory(table_data, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, - 0, MEM_AFFINITY_ENABLED); - mem_base = VIRT_HIGHMEM_BASE; - if (!nb_numa_nodes) { - mem_len = machine->ram_size - VIRT_LOWMEM_SIZE; - } else { - mem_len = machine->numa_state->nodes[0].node_mem - VIRT_LOWMEM_SIZE; + base = VIRT_LOWMEM_BASE; + gap = VIRT_LOWMEM_SIZE; + numa_info = machine->numa_state->nodes; + nodes = nb_numa_nodes; + if (!nodes) { + nodes = 1; } 
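/*
 * Editor's sketch of the SRAT split performed by the loop below: each
 * node's RAM first fills what is left of the low window starting at
 * VIRT_LOWMEM_BASE, and any remainder continues at VIRT_HIGHMEM_BASE.
 * One node's first-pass decision, with gap = bytes left in the current
 * window; the helper name is illustrative.
 */
static void demo_split_node_ram(uint64_t len, uint64_t gap,
                                uint64_t *low_part, uint64_t *high_part)
{
    if (len >= gap) {
        *low_part = gap;          /* fills the rest of the low window */
        *high_part = len - gap;   /* spills into the high window */
    } else {
        *low_part = len;          /* node fits below the window limit */
        *high_part = 0;
    }
}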
- if (mem_len) - build_srat_memory(table_data, mem_base, mem_len, 0, MEM_AFFINITY_ENABLED); - - /* Node1 - Nodemax */ - if (nb_numa_nodes) { - mem_base += mem_len; - for (i = 1; i < nb_numa_nodes; ++i) { - if (machine->numa_state->nodes[i].node_mem > 0) { - build_srat_memory(table_data, mem_base, - machine->numa_state->nodes[i].node_mem, i, - MEM_AFFINITY_ENABLED); - mem_base += machine->numa_state->nodes[i].node_mem; - } + + for (i = 0; i < nodes; i++) { + if (nb_numa_nodes) { + len = numa_info[i].node_mem; + } else { + len = machine->ram_size; + } + + /* + * memory for the node is split into two parts + * lowram: [base, +gap) + * highram: [VIRT_HIGHMEM_BASE, +(len - gap)) + */ + if (len >= gap) { + build_srat_memory(table_data, base, gap, i, MEM_AFFINITY_ENABLED); + len -= gap; + base = VIRT_HIGHMEM_BASE; + gap = machine->ram_size - VIRT_LOWMEM_SIZE; + } + + if (len) { + build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED); + base += len; + gap -= len; + } } if (machine->device_memory) { build_srat_memory(table_data, machine->device_memory->base, memory_region_size(&machine->device_memory->mr), - nb_numa_nodes - 1, + nodes - 1, MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); } acpi_table_end(linker, &table); } +/* + * Serial Port Console Redirection Table (SPCR) + * https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table + */ +static void +spcr_setup(GArray *table_data, BIOSLinker *linker, MachineState *machine) +{ + LoongArchVirtMachineState *lvms; + AcpiSpcrData serial = { + .interface_type = 0, /* 16550 compatible */ + .base_addr.id = AML_AS_SYSTEM_MEMORY, + .base_addr.width = 32, + .base_addr.offset = 0, + .base_addr.size = 1, + .base_addr.addr = VIRT_UART_BASE, + .interrupt_type = 0, /* Interrupt not supported */ + .pc_interrupt = 0, + .interrupt = VIRT_UART_IRQ, + .baud_rate = 7, /* 115200 */ + .parity = 0, + .stop_bits = 1, + .flow_control = 0, + .terminal_type = 3, /* ANSI */ + .language = 0, /* Language */ + .pci_device_id = 0xffff, /* not a PCI device */ + .pci_vendor_id = 0xffff, /* not a PCI device */ + .pci_bus = 0, + .pci_device = 0, + .pci_function = 0, + .pci_flags = 0, + .pci_segment = 0, + }; + + lvms = LOONGARCH_VIRT_MACHINE(machine); + build_spcr(table_data, linker, &serial, 2, lvms->oem_id, + lvms->oem_table_id); +} + typedef struct AcpiBuildState { /* Copy of table in RAM (for patching).
*/ @@ -241,23 +309,27 @@ struct AcpiBuildState { MemoryRegion *linker_mr; } AcpiBuildState; -static void build_uart_device_aml(Aml *table) +static void build_uart_device_aml(Aml *table, int index) { Aml *dev; Aml *crs; Aml *pkg0, *pkg1, *pkg2; - uint32_t uart_irq = VIRT_UART_IRQ; - - Aml *scope = aml_scope("_SB"); - dev = aml_device("COMA"); + Aml *scope; + uint32_t uart_irq; + uint64_t base; + + uart_irq = VIRT_UART_IRQ + index; + base = VIRT_UART_BASE + index * VIRT_UART_SIZE; + scope = aml_scope("_SB"); + dev = aml_device("COM%d", index); aml_append(dev, aml_name_decl("_HID", aml_string("PNP0501"))); - aml_append(dev, aml_name_decl("_UID", aml_int(0))); + aml_append(dev, aml_name_decl("_UID", aml_int(index))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); crs = aml_resource_template(); aml_append(crs, aml_qword_memory(AML_POS_DECODE, AML_MIN_FIXED, AML_MAX_FIXED, AML_NON_CACHEABLE, AML_READ_WRITE, - 0, VIRT_UART_BASE, VIRT_UART_BASE + VIRT_UART_SIZE - 1, + 0, base, base + VIRT_UART_SIZE - 1, 0, VIRT_UART_SIZE)); aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH, AML_SHARED, &uart_irq, 1)); @@ -279,23 +351,36 @@ static void build_la_ged_aml(Aml *dsdt, MachineState *machine) { uint32_t event; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + CPUHotplugFeatures opts; build_ged_aml(dsdt, "\\_SB."GED_DEVICE, - HOTPLUG_HANDLER(lams->acpi_ged), + HOTPLUG_HANDLER(lvms->acpi_ged), VIRT_SCI_IRQ, AML_SYSTEM_MEMORY, VIRT_GED_EVT_ADDR); - event = object_property_get_uint(OBJECT(lams->acpi_ged), + event = object_property_get_uint(OBJECT(lvms->acpi_ged), "ged-event", &error_abort); if (event & ACPI_GED_MEM_HOTPLUG_EVT) { build_memory_hotplug_aml(dsdt, machine->ram_slots, "\\_SB", NULL, AML_SYSTEM_MEMORY, VIRT_GED_MEM_ADDR); } + + if (event & ACPI_GED_CPU_HOTPLUG_EVT) { + opts.acpi_1_compatible = false; + opts.has_legacy_cphp = false; + opts.fw_unplugs_cpu = false; + opts.smi_path = NULL; + + build_cpus_aml(dsdt, machine, opts, virt_madt_cpu_entry, NULL, + VIRT_GED_CPUHP_ADDR, "\\_SB", + NULL, AML_SYSTEM_MEMORY); + } + acpi_dsdt_add_power_button(dsdt); } -static void build_pci_device_aml(Aml *scope, LoongArchMachineState *lams) +static void build_pci_device_aml(Aml *scope, LoongArchVirtMachineState *lvms) { struct GPEXConfig cfg = { .mmio64.base = VIRT_PCI_MEM_BASE, @@ -305,31 +390,54 @@ static void build_pci_device_aml(Aml *scope, LoongArchMachineState *lams) .ecam.base = VIRT_PCI_CFG_BASE, .ecam.size = VIRT_PCI_CFG_SIZE, .irq = VIRT_GSI_BASE + VIRT_DEVICE_IRQS, - .bus = lams->pci_bus, + .bus = lvms->pci_bus, }; acpi_dsdt_add_gpex(scope, &cfg); } -static void build_flash_aml(Aml *scope, LoongArchMachineState *lams) +static void build_flash_aml(Aml *scope, LoongArchVirtMachineState *lvms) { Aml *dev, *crs; + MemoryRegion *flash_mem; - hwaddr flash_base = VIRT_FLASH_BASE; - hwaddr flash_size = VIRT_FLASH_SIZE; + hwaddr flash0_base; + hwaddr flash0_size; + + hwaddr flash1_base; + hwaddr flash1_size; + + flash_mem = pflash_cfi01_get_memory(lvms->flash[0]); + flash0_base = flash_mem->addr; + flash0_size = memory_region_size(flash_mem); + + flash_mem = pflash_cfi01_get_memory(lvms->flash[1]); + flash1_base = flash_mem->addr; + flash1_size = memory_region_size(flash_mem); dev = aml_device("FLS0"); aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015"))); aml_append(dev, aml_name_decl("_UID", aml_int(0))); crs = aml_resource_template(); - aml_append(crs, aml_memory32_fixed(flash_base, flash_size, 
AML_READ_WRITE)); + aml_append(crs, aml_memory32_fixed(flash0_base, flash0_size, + AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + + dev = aml_device("FLS1"); + aml_append(dev, aml_name_decl("_HID", aml_string("LNRO0015"))); + aml_append(dev, aml_name_decl("_UID", aml_int(1))); + + crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(flash1_base, flash1_size, + AML_READ_WRITE)); aml_append(dev, aml_name_decl("_CRS", crs)); aml_append(scope, dev); } #ifdef CONFIG_TPM -static void acpi_dsdt_add_tpm(Aml *scope, LoongArchMachineState *vms) +static void acpi_dsdt_add_tpm(Aml *scope, LoongArchVirtMachineState *vms) { PlatformBusDevice *pbus = PLATFORM_BUS_DEVICE(vms->platform_bus_dev); hwaddr pbus_base = VIRT_PLATFORM_BUS_BASEADDRESS; @@ -367,19 +475,21 @@ static void acpi_dsdt_add_tpm(Aml *scope, LoongArchMachineState *vms) static void build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) { + int i; Aml *dsdt, *scope, *pkg; - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); - AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = lams->oem_id, - .oem_table_id = lams->oem_table_id }; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); + AcpiTable table = { .sig = "DSDT", .rev = 1, .oem_id = lvms->oem_id, + .oem_table_id = lvms->oem_table_id }; acpi_table_begin(&table, table_data); dsdt = init_aml_allocator(); - build_uart_device_aml(dsdt); - build_pci_device_aml(dsdt, lams); + for (i = 0; i < VIRT_UART_COUNT; i++) + build_uart_device_aml(dsdt, i); + build_pci_device_aml(dsdt, lvms); build_la_ged_aml(dsdt, machine); - build_flash_aml(dsdt, lams); + build_flash_aml(dsdt, lvms); #ifdef CONFIG_TPM - acpi_dsdt_add_tpm(dsdt, lams); + acpi_dsdt_add_tpm(dsdt, lvms); #endif /* System State Package */ scope = aml_scope("\\"); @@ -398,7 +508,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, MachineState *machine) static void acpi_build(AcpiBuildTables *tables, MachineState *machine) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); GArray *table_offsets; AcpiFadtData fadt_data; unsigned facs, rsdt, dsdt; @@ -432,28 +542,30 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) fadt_data.dsdt_tbl_offset = &dsdt; fadt_data.xdsdt_tbl_offset = &dsdt; build_fadt(tables_blob, tables->linker, &fadt_data, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); acpi_add_table(table_offsets, tables_blob); - build_madt(tables_blob, tables->linker, lams); + build_madt(tables_blob, tables->linker, lvms); acpi_add_table(table_offsets, tables_blob); build_pptt(tables_blob, tables->linker, machine, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); acpi_add_table(table_offsets, tables_blob); build_srat(tables_blob, tables->linker, machine); + acpi_add_table(table_offsets, tables_blob); + spcr_setup(tables_blob, tables->linker, machine); if (machine->numa_state->num_nodes) { if (machine->numa_state->have_numa_distance) { acpi_add_table(table_offsets, tables_blob); - build_slit(tables_blob, tables->linker, machine, lams->oem_id, - lams->oem_table_id); + build_slit(tables_blob, tables->linker, machine, lvms->oem_id, + lvms->oem_table_id); } if (machine->numa_state->hmat_enabled) { acpi_add_table(table_offsets, tables_blob); build_hmat(tables_blob, tables->linker, machine->numa_state, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); } } @@ -463,8 +575,8 
@@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) .base = cpu_to_le64(VIRT_PCI_CFG_BASE), .size = cpu_to_le64(VIRT_PCI_CFG_SIZE), }; - build_mcfg(tables_blob, tables->linker, &mcfg, lams->oem_id, - lams->oem_table_id); + build_mcfg(tables_blob, tables->linker, &mcfg, lvms->oem_id, + lvms->oem_table_id); } #ifdef CONFIG_TPM @@ -472,8 +584,8 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) if (tpm_get_version(tpm_find()) == TPM_VERSION_2_0) { acpi_add_table(table_offsets, tables_blob); build_tpm2(tables_blob, tables->linker, - tables->tcpalog, lams->oem_id, - lams->oem_table_id); + tables->tcpalog, lvms->oem_id, + lvms->oem_table_id); } #endif /* Add tables supplied by user (if any) */ @@ -487,13 +599,13 @@ static void acpi_build(AcpiBuildTables *tables, MachineState *machine) /* RSDT is pointed to by RSDP */ rsdt = tables_blob->len; build_rsdt(tables_blob, tables->linker, table_offsets, - lams->oem_id, lams->oem_table_id); + lvms->oem_id, lvms->oem_table_id); /* RSDP is in FSEG memory, so allocate it separately */ { AcpiRsdpData rsdp_data = { .revision = 0, - .oem_id = lams->oem_id, + .oem_id = lvms->oem_id, .xsdt_tbl_offset = NULL, .rsdt_tbl_offset = &rsdt, }; @@ -570,17 +682,25 @@ static const VMStateDescription vmstate_acpi_build = { }, }; -void loongarch_acpi_setup(LoongArchMachineState *lams) +static bool loongarch_is_acpi_enabled(LoongArchVirtMachineState *lvms) +{ + if (lvms->acpi == ON_OFF_AUTO_OFF) { + return false; + } + return true; +} + +void loongarch_acpi_setup(LoongArchVirtMachineState *lvms) { AcpiBuildTables tables; AcpiBuildState *build_state; - if (!lams->fw_cfg) { + if (!lvms->fw_cfg) { ACPI_BUILD_DPRINTF("No fw cfg. Bailing out.\n"); return; } - if (!loongarch_is_acpi_enabled(lams)) { + if (!loongarch_is_acpi_enabled(lvms)) { ACPI_BUILD_DPRINTF("ACPI disabled. Bailing out.\n"); return; } @@ -588,7 +708,7 @@ void loongarch_acpi_setup(LoongArchMachineState *lams) build_state = g_malloc0(sizeof *build_state); acpi_build_tables_init(&tables); - acpi_build(&tables, MACHINE(lams)); + acpi_build(&tables, MACHINE(lvms)); /* Now expose it all to Guest */ build_state->table_mr = acpi_add_rom_blob(acpi_build_update, @@ -604,6 +724,9 @@ void loongarch_acpi_setup(LoongArchMachineState *lams) build_state, tables.rsdp, ACPI_BUILD_RSDP_FILE); + fw_cfg_add_file(lvms->fw_cfg, ACPI_BUILD_TPMLOG_FILE, tables.tcpalog->data, + acpi_data_len(tables.tcpalog)); + qemu_register_reset(acpi_build_reset, build_state); acpi_build_reset(build_state); vmstate_register(NULL, 0, &vmstate_acpi_build, build_state); diff --git a/hw/loongarch/boot.c b/hw/loongarch/boot.c new file mode 100644 index 0000000000000000000000000000000000000000..39c4a6d8c6db76d84a1c1e03b71433115f0f0c51 --- /dev/null +++ b/hw/loongarch/boot.c @@ -0,0 +1,373 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch boot helper functions. + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "target/loongarch/cpu.h" +#include "hw/loongarch/virt.h" +#include "hw/loader.h" +#include "elf.h" +#include "qemu/error-report.h" +#include "sysemu/reset.h" +#include "sysemu/qtest.h" + +struct memmap_entry *memmap_table; +unsigned memmap_entries; + +ram_addr_t initrd_offset; +uint64_t initrd_size; + +static const unsigned int slave_boot_code[] = { + /* Configure reset ebase. */ + 0x0400302c, /* csrwr $t0, LOONGARCH_CSR_EENTRY */ + + /* Disable interrupt. 
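+     * (clear CRMD.IE, bit 2, i.e. the 0x4 immediate below, so the pending
+     * IPI can wake the core from 'idle' without vectoring while it waits
+     * for its entry point)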
+     */
+    0x0380100c,  /* ori        $t0, $zero, 0x4                */
+    0x04000180,  /* csrxchg    $zero, $t0, LOONGARCH_CSR_CRMD */
+
+    /* Clear mailbox. */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)            */
+    0x038081ad,  /* ori        $t1, $t1, CORE_BUF_20  */
+    0x06481da0,  /* iocsrwr.d  $zero, $t1             */
+
+    /* Enable IPI interrupt. */
+    0x1400002c,  /* lu12i.w    $t0, 1(0x1)                  */
+    0x0400118c,  /* csrxchg    $t0, $t0, LOONGARCH_CSR_ECFG */
+    0x02fffc0c,  /* addi.d     $t0, $r0, -1(0xfff)          */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)                  */
+    0x038011ad,  /* ori        $t1, $t1, CORE_EN_OFF        */
+    0x064819ac,  /* iocsrwr.w  $t0, $t1                     */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)                  */
+    0x038081ad,  /* ori        $t1, $t1, CORE_BUF_20        */
+
+    /* Wait for wakeup <.L11>: */
+    0x06488000,  /* idle       0x0                             */
+    0x03400000,  /* andi       $zero, $zero, 0x0               */
+    0x064809ac,  /* iocsrrd.w  $t0, $t1                        */
+    0x43fff59f,  /* beqz       $t0, -12(0x7ffff4) # 48 <.L11>  */
+
+    /* Read and clear IPI interrupt. */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)              */
+    0x064809ac,  /* iocsrrd.w  $t0, $t1                 */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)              */
+    0x038031ad,  /* ori        $t1, $t1, CORE_CLEAR_OFF */
+    0x064819ac,  /* iocsrwr.w  $t0, $t1                 */
+
+    /* Disable IPI interrupt. */
+    0x1400002c,  /* lu12i.w    $t0, 1(0x1)                    */
+    0x04001180,  /* csrxchg    $zero, $t0, LOONGARCH_CSR_ECFG */
+
+    /* Read mail buf and jump to specified entry. */
+    0x1400002d,  /* lu12i.w    $t1, 1(0x1)           */
+    0x038081ad,  /* ori        $t1, $t1, CORE_BUF_20 */
+    0x06480dac,  /* iocsrrd.d  $t0, $t1              */
+    0x00150181,  /* move       $ra, $t0              */
+    0x4c000020,  /* jirl       $zero, $ra, 0         */
+};
+
+static inline void *guidcpy(void *dst, const void *src)
+{
+    return memcpy(dst, src, sizeof(efi_guid_t));
+}
+
+static void init_efi_boot_memmap(struct efi_system_table *systab,
+                                 void *p, void *start)
+{
+    unsigned i;
+    struct efi_boot_memmap *boot_memmap = p;
+    efi_guid_t tbl_guid = LINUX_EFI_BOOT_MEMMAP_GUID;
+
+    /* efi_configuration_table 1 */
+    guidcpy(&systab->tables[0].guid, &tbl_guid);
+    systab->tables[0].table = (struct efi_configuration_table *)(p - start);
+    systab->nr_tables = 1;
+
+    boot_memmap->desc_size = sizeof(efi_memory_desc_t);
+    boot_memmap->desc_ver = 1;
+    boot_memmap->map_size = 0;
+
+    efi_memory_desc_t *map = p + sizeof(struct efi_boot_memmap);
+    for (i = 0; i < memmap_entries; i++) {
+        map = (void *)boot_memmap + sizeof(*map);
+        map[i].type = memmap_table[i].type;
+        map[i].phys_addr = ROUND_UP(memmap_table[i].address, 64 * KiB);
+        map[i].num_pages = ROUND_DOWN(memmap_table[i].address +
+                       memmap_table[i].length - map[i].phys_addr, 64 * KiB);
+        p += sizeof(efi_memory_desc_t);
+    }
+}
+
+static void init_efi_initrd_table(struct efi_system_table *systab,
+                                  void *p, void *start)
+{
+    efi_guid_t tbl_guid = LINUX_EFI_INITRD_MEDIA_GUID;
+    struct efi_initrd *initrd_table = p;
+
+    /* efi_configuration_table 2 */
+    guidcpy(&systab->tables[1].guid, &tbl_guid);
+    systab->tables[1].table = (struct efi_configuration_table *)(p - start);
+    systab->nr_tables = 2;
+
+    initrd_table->base = initrd_offset;
+    initrd_table->size = initrd_size;
+}
+
+static void init_efi_fdt_table(struct efi_system_table *systab)
+{
+    efi_guid_t tbl_guid = DEVICE_TREE_GUID;
+
+    /* efi_configuration_table 3 */
+    guidcpy(&systab->tables[2].guid, &tbl_guid);
+    systab->tables[2].table = (void *)FDT_BASE;
+    systab->nr_tables = 3;
+}
+
+static void init_systab(struct loongarch_boot_info *info, void *p, void *start)
+{
+    void *bp_tables_start;
+    struct efi_system_table *systab = p;
+
+    info->a2 = p - start;
+
+    systab->hdr.signature = EFI_SYSTEM_TABLE_SIGNATURE;
+    systab->hdr.revision = EFI_SPECIFICATION_VERSION;
+    systab->hdr.headersize = sizeof(struct efi_system_table);
+    systab->fw_revision = FW_VERSION << 16 | FW_PATCHLEVEL << 8;
+    systab->runtime = 0;
+    systab->boottime = 0;
+    systab->nr_tables = 0;
+
+    p += ROUND_UP(sizeof(struct efi_system_table), 64 * KiB);
+
+    systab->tables = p;
+    bp_tables_start = p;
+
+    init_efi_boot_memmap(systab, p, start);
+    p += ROUND_UP(sizeof(struct efi_boot_memmap) +
+                  sizeof(efi_memory_desc_t) * memmap_entries, 64 * KiB);
+    init_efi_initrd_table(systab, p, start);
+    p += ROUND_UP(sizeof(struct efi_initrd), 64 * KiB);
+    init_efi_fdt_table(systab);
+
+    systab->tables = (struct efi_configuration_table *)(bp_tables_start - start);
+}
+
+static void init_cmdline(struct loongarch_boot_info *info, void *p, void *start)
+{
+    hwaddr cmdline_addr = p - start;
+
+    info->a0 = 1;
+    info->a1 = cmdline_addr;
+
+    g_strlcpy(p, info->kernel_cmdline, COMMAND_LINE_SIZE);
+}
+
+static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr)
+{
+    return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS);
+}
+
+static void find_initrd_loadoffset(struct loongarch_boot_info *info,
+                                   uint64_t kernel_high, ssize_t kernel_size)
+{
+    hwaddr base, size, gap, low_end;
+    ram_addr_t initrd_end, initrd_start;
+
+    base = VIRT_LOWMEM_BASE;
+    gap = VIRT_LOWMEM_SIZE;
+    initrd_start = ROUND_UP(kernel_high + 4 * kernel_size, 64 * KiB);
+    initrd_end = initrd_start + initrd_size;
+
+    size = info->ram_size;
+    low_end = base + MIN(size, gap);
+    if (initrd_end <= low_end) {
+        initrd_offset = initrd_start;
+        return;
+    }
+
+    if (size <= gap) {
+        error_report("the low memory is too small for initial ram disk '%s', "
+                     "you need to expand the memory space",
+                     info->initrd_filename);
+        exit(1);
+    }
+
+    /*
+     * Try to load the initrd in the high memory.
+     */
+    size -= gap;
+    base = VIRT_HIGHMEM_BASE;
+    initrd_start = ROUND_UP(base, 64 * KiB);
+    if (initrd_size <= size) {
+        initrd_offset = initrd_start;
+        return;
+    }
+
+    error_report("the high memory is too small for initial ram disk '%s', "
+                 "you need to expand the memory space",
+                 info->initrd_filename);
+    exit(1);
+}
+
+static int64_t load_kernel_info(struct loongarch_boot_info *info)
+{
+    uint64_t kernel_entry, kernel_low, kernel_high;
+    ssize_t kernel_size;
+
+    kernel_size = load_elf(info->kernel_filename, NULL,
+                           cpu_loongarch_virt_to_phys, NULL,
+                           &kernel_entry, &kernel_low,
+                           &kernel_high, NULL, 0,
+                           EM_LOONGARCH, 1, 0);
+
+    if (kernel_size < 0) {
+        error_report("could not load kernel '%s': %s",
+                     info->kernel_filename,
+                     load_elf_strerror(kernel_size));
+        exit(1);
+    }
+
+    if (info->initrd_filename) {
+        initrd_size = get_image_size(info->initrd_filename);
+        if (initrd_size > 0) {
+            find_initrd_loadoffset(info, kernel_high, kernel_size);
+            initrd_size = load_image_targphys(info->initrd_filename,
+                                              initrd_offset, initrd_size);
+        }
+
+        if (initrd_size == (target_ulong)-1) {
+            error_report("could not load initial ram disk '%s'",
+                         info->initrd_filename);
+            exit(1);
+        }
+    } else {
+        initrd_size = 0;
+    }
+
+    return kernel_entry;
+}
+
+void reset_load_elf(void *opaque)
+{
+    LoongArchCPU *cpu = opaque;
+    CPULoongArchState *env = &cpu->env;
+
+    cpu_reset(CPU(cpu));
+    if (env->load_elf) {
+        if (cpu == LOONGARCH_CPU(first_cpu)) {
+            env->gpr[4] = env->boot_info->a0;
+            env->gpr[5] = env->boot_info->a1;
+            env->gpr[6] = env->boot_info->a2;
+        }
+        cpu_set_pc(CPU(cpu), env->elf_address);
+    }
+}
+
+static void fw_cfg_add_kernel_info(struct loongarch_boot_info *info,
+                                   FWCfgState *fw_cfg)
+{
+    /*
+     * Expose the kernel, the command line, and the initrd in fw_cfg.
+     * We don't process them here at all; it's all left to the
+     * firmware.
+     */
+    load_image_to_fw_cfg(fw_cfg,
+                         FW_CFG_KERNEL_SIZE, FW_CFG_KERNEL_DATA,
+                         info->kernel_filename,
+                         false);
+
+    if (info->initrd_filename) {
+        load_image_to_fw_cfg(fw_cfg,
+                             FW_CFG_INITRD_SIZE, FW_CFG_INITRD_DATA,
+                             info->initrd_filename, false);
+    }
+
+    if (info->kernel_cmdline) {
+        fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
+                       strlen(info->kernel_cmdline) + 1);
+        fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA,
+                          info->kernel_cmdline);
+    }
+}
+
+static void loongarch_firmware_boot(LoongArchVirtMachineState *lvms,
+                                    struct loongarch_boot_info *info)
+{
+    fw_cfg_add_kernel_info(info, lvms->fw_cfg);
+}
+
+static void init_boot_rom(struct loongarch_boot_info *info, void *p)
+{
+    void *start = p;
+
+    init_cmdline(info, p, start);
+    p += COMMAND_LINE_SIZE;
+
+    init_systab(info, p, start);
+}
+
+static void loongarch_direct_kernel_boot(struct loongarch_boot_info *info)
+{
+    void *p, *bp;
+    int64_t kernel_addr = VIRT_FLASH0_BASE;
+    LoongArchCPU *lacpu;
+    CPUState *cs;
+
+    if (info->kernel_filename) {
+        kernel_addr = load_kernel_info(info);
+    } else {
+        if (!qtest_enabled()) {
+            warn_report("No kernel provided, booting from flash drive.");
+        }
+    }
+
+    /* Load cmdline and system tables at [0 - 1 MiB] */
+    p = g_malloc0(1 * MiB);
+    bp = p;
+    init_boot_rom(info, p);
+    rom_add_blob_fixed_as("boot_info", bp, 1 * MiB, 0, &address_space_memory);
+
+    /* Load slave boot code at pflash0. */
+    void *boot_code = g_malloc0(VIRT_FLASH0_SIZE);
+    memcpy(boot_code, &slave_boot_code, sizeof(slave_boot_code));
+    rom_add_blob_fixed("boot_code", boot_code, VIRT_FLASH0_SIZE,
+                       VIRT_FLASH0_BASE);
+
+    CPU_FOREACH(cs) {
+        lacpu = LOONGARCH_CPU(cs);
+        lacpu->env.load_elf = true;
+        if (cs == first_cpu) {
+            lacpu->env.elf_address = kernel_addr;
+        } else {
+            lacpu->env.elf_address = VIRT_FLASH0_BASE;
+        }
+        lacpu->env.boot_info = info;
+    }
+
+    g_free(boot_code);
+    g_free(bp);
+}
+
+void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info)
+{
+    LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(ms);
+    int i;
+
+    /* register reset function */
+    for (i = 0; i < ms->smp.cpus; i++) {
+        qemu_register_reset(reset_load_elf, LOONGARCH_CPU(qemu_get_cpu(i)));
+    }
+
+    info->kernel_filename = ms->kernel_filename;
+    info->kernel_cmdline = ms->kernel_cmdline;
+    info->initrd_filename = ms->initrd_filename;
+
+    if (lvms->bios_loaded) {
+        loongarch_firmware_boot(lvms, info);
+    } else {
+        loongarch_direct_kernel_boot(info);
+    }
+}
diff --git a/hw/loongarch/fw_cfg.c b/hw/loongarch/fw_cfg.c
index f15a17416c4814100ac9d45ff23b0abacc856eed..35aeb2decbd1ed87da6689672b134b6684321d62 100644
--- a/hw/loongarch/fw_cfg.c
+++ b/hw/loongarch/fw_cfg.c
@@ -17,7 +17,7 @@ static void fw_cfg_boot_set(void *opaque, const char *boot_device,
     fw_cfg_modify_i16(opaque, FW_CFG_BOOT_DEVICE, boot_device[0]);
 }
 
-FWCfgState *loongarch_fw_cfg_init(ram_addr_t ram_size, MachineState *ms)
+FWCfgState *virt_fw_cfg_init(ram_addr_t ram_size, MachineState *ms)
 {
     FWCfgState *fw_cfg;
     int max_cpus = ms->smp.max_cpus;
diff --git a/hw/loongarch/fw_cfg.h b/hw/loongarch/fw_cfg.h
index 7c0de4db4ad44a582fddd1cf3e44e140b2a0d8b3..27ee68286ed115f5e3f99b27c8d3ad2ec1d1550f 100644
--- a/hw/loongarch/fw_cfg.h
+++ b/hw/loongarch/fw_cfg.h
@@ -11,5 +11,5 @@
 #include "hw/boards.h"
 #include "hw/nvram/fw_cfg.h"
 
-FWCfgState *loongarch_fw_cfg_init(ram_addr_t ram_size, MachineState *ms);
+FWCfgState *virt_fw_cfg_init(ram_addr_t ram_size, MachineState *ms);
 #endif
diff --git a/hw/loongarch/meson.build b/hw/loongarch/meson.build
index c0421502abab9a8ca9c3751b37969d9e6a8caf3d..d306d82c2ee0e6a671f8180c6554a7185d47c98e 100644
--- a/hw/loongarch/meson.build
+++ b/hw/loongarch/meson.build
@@ -1,6 +1,7 @@
 loongarch_ss = ss.source_set()
 loongarch_ss.add(files(
     'fw_cfg.c',
+    'boot.c',
 ))
 loongarch_ss.add(when: 'CONFIG_LOONGARCH_VIRT', if_true: [files('virt.c'), fdt])
 loongarch_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-build.c'))
diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 4b7dc67a2d79a136438afa789d98916878d8a9e1..233297d78f397a5201a7b2a90d92315bd835a1ce 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -10,11 +10,13 @@
 #include "qapi/error.h"
 #include "hw/boards.h"
 #include "hw/char/serial.h"
+#include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/qtest.h"
 #include "sysemu/runstate.h"
 #include "sysemu/reset.h"
 #include "sysemu/rtc.h"
+#include "sysemu/tcg.h"
 #include "hw/loongarch/virt.h"
 #include "exec/address-spaces.h"
 #include "hw/irq.h"
@@ -44,17 +47,43 @@
 #include "sysemu/tpm.h"
 #include "sysemu/block-backend.h"
 #include "hw/block/flash.h"
+#include "hw/virtio/virtio-iommu.h"
 #include "qemu/error-report.h"
+#include "qemu/guest-random.h"
+
+#define FDT_IRQ_FLAGS_EDGE_LO_HI 1
+#define FDT_IRQ_FLAGS_EDGE_HI_LO 2
+#define FDT_IRQ_FLAGS_LEVEL_HI   4
+#define FDT_IRQ_FLAGS_LEVEL_LO   8
 
-struct loaderparams {
-    uint64_t ram_size;
-    const char *kernel_filename;
-    const char *kernel_cmdline;
-    const char *initrd_filename;
-};
+static bool virt_is_veiointc_enabled(LoongArchVirtMachineState *lvms)
+{
+    if (lvms->veiointc == ON_OFF_AUTO_OFF) {
+        return false;
+    }
+    return true;
+}
 
-static void virt_flash_create(LoongArchMachineState *lams)
+static void virt_get_veiointc(Object *obj, Visitor *v, const char *name,
+                              void *opaque, Error **errp)
+{
+    LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj);
+    OnOffAuto veiointc = lvms->veiointc;
+
+    visit_type_OnOffAuto(v, name, &veiointc, errp);
+}
+
+static void virt_set_veiointc(Object *obj, Visitor *v, const char *name,
+                              void *opaque, Error **errp)
+{
+    LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj);
+
+    visit_type_OnOffAuto(v, name, &lvms->veiointc, errp);
+}
+
+static PFlashCFI01 *virt_flash_create1(LoongArchVirtMachineState *lvms,
+                                       const char *name,
+                                       const char *alias_prop_name)
 {
     DeviceState *dev = qdev_new(TYPE_PFLASH_CFI01);
 
@@ -66,84 +95,262 @@ static void virt_flash_create(LoongArchMachineState *lams)
     qdev_prop_set_uint16(dev, "id1", 0x18);
     qdev_prop_set_uint16(dev, "id2", 0x00);
     qdev_prop_set_uint16(dev, "id3", 0x00);
-    qdev_prop_set_string(dev, "name", "virt.flash");
-    object_property_add_child(OBJECT(lams), "virt.flash", OBJECT(dev));
-    object_property_add_alias(OBJECT(lams), "pflash",
+    qdev_prop_set_string(dev, "name", name);
+    object_property_add_child(OBJECT(lvms), name, OBJECT(dev));
+    object_property_add_alias(OBJECT(lvms), alias_prop_name,
                               OBJECT(dev), "drive");
+    return PFLASH_CFI01(dev);
+}
 
-    lams->flash = PFLASH_CFI01(dev);
+static void virt_flash_create(LoongArchVirtMachineState *lvms)
+{
+    lvms->flash[0] = virt_flash_create1(lvms, "virt.flash0", "pflash0");
+    lvms->flash[1] = virt_flash_create1(lvms, "virt.flash1", "pflash1");
 }
 
-static void virt_flash_map(LoongArchMachineState *lams,
-                           MemoryRegion *sysmem)
+static void virt_flash_map1(PFlashCFI01 *flash,
+                            hwaddr base, hwaddr size,
+                            MemoryRegion *sysmem)
 {
-    PFlashCFI01 *flash = lams->flash;
     DeviceState *dev = DEVICE(flash);
-    hwaddr base = VIRT_FLASH_BASE;
-    hwaddr size =
VIRT_FLASH_SIZE; + BlockBackend *blk; + hwaddr real_size = size; - assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE)); - assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX); + blk = pflash_cfi01_get_blk(flash); + if (blk) { + real_size = blk_getlength(blk); + assert(real_size && real_size <= size); + } + + assert(QEMU_IS_ALIGNED(real_size, VIRT_FLASH_SECTOR_SIZE)); + assert(real_size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX); - qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); + qdev_prop_set_uint32(dev, "num-blocks", real_size / VIRT_FLASH_SECTOR_SIZE); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); memory_region_add_subregion(sysmem, base, sysbus_mmio_get_region(SYS_BUS_DEVICE(dev), 0)); +} + +static void virt_flash_map(LoongArchVirtMachineState *lvms, + MemoryRegion *sysmem) +{ + PFlashCFI01 *flash0 = lvms->flash[0]; + PFlashCFI01 *flash1 = lvms->flash[1]; + + virt_flash_map1(flash0, VIRT_FLASH0_BASE, VIRT_FLASH0_SIZE, sysmem); + virt_flash_map1(flash1, VIRT_FLASH1_BASE, VIRT_FLASH1_SIZE, sysmem); +} + +static void fdt_add_cpuic_node(LoongArchVirtMachineState *lvms, + uint32_t *cpuintc_phandle) +{ + MachineState *ms = MACHINE(lvms); + char *nodename; + + *cpuintc_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/cpuic"); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *cpuintc_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,cpu-interrupt-controller"); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1); + g_free(nodename); +} + +static void fdt_add_eiointc_node(LoongArchVirtMachineState *lvms, + uint32_t *cpuintc_phandle, + uint32_t *eiointc_phandle) +{ + MachineState *ms = MACHINE(lvms); + char *nodename; + hwaddr extioi_base = APIC_BASE; + hwaddr extioi_size = EXTIOI_SIZE; + *eiointc_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/eiointc@%" PRIx64, extioi_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *eiointc_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,ls2k2000-eiointc"); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 1); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *cpuintc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupts", 3); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0x0, + extioi_base, 0x0, extioi_size); + g_free(nodename); } -static void fdt_add_flash_node(LoongArchMachineState *lams) +static void fdt_add_pch_pic_node(LoongArchVirtMachineState *lvms, + uint32_t *eiointc_phandle, + uint32_t *pch_pic_phandle) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); char *nodename; + hwaddr pch_pic_base = VIRT_PCH_REG_BASE; + hwaddr pch_pic_size = VIRT_PCH_REG_SIZE; - hwaddr flash_base = VIRT_FLASH_BASE; - hwaddr flash_size = VIRT_FLASH_SIZE; + *pch_pic_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/platic@%" PRIx64, pch_pic_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *pch_pic_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,pch-pic-1.0"); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0, + pch_pic_base, 0, pch_pic_size); + qemu_fdt_setprop(ms->fdt, nodename, 
"interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 2); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *eiointc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,pic-base-vec", 0); + g_free(nodename); +} - nodename = g_strdup_printf("/flash@%" PRIx64, flash_base); +static void fdt_add_pch_msi_node(LoongArchVirtMachineState *lvms, + uint32_t *eiointc_phandle, + uint32_t *pch_msi_phandle) +{ + MachineState *ms = MACHINE(lvms); + char *nodename; + hwaddr pch_msi_base = VIRT_PCH_MSI_ADDR_LOW; + hwaddr pch_msi_size = VIRT_PCH_MSI_SIZE; + + *pch_msi_phandle = qemu_fdt_alloc_phandle(ms->fdt); + nodename = g_strdup_printf("/msi@%" PRIx64, pch_msi_base); + qemu_fdt_add_subnode(ms->fdt, nodename); + qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", *pch_msi_phandle); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,pch-msi-1.0"); + qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", + 0, pch_msi_base, + 0, pch_msi_size); + qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *eiointc_phandle); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,msi-base-vec", + VIRT_PCH_PIC_IRQ_NUM); + qemu_fdt_setprop_cell(ms->fdt, nodename, "loongson,msi-num-vecs", + EXTIOI_IRQS - VIRT_PCH_PIC_IRQ_NUM); + g_free(nodename); +} + +static void fdt_add_flash_node(LoongArchVirtMachineState *lvms) +{ + MachineState *ms = MACHINE(lvms); + char *nodename; + MemoryRegion *flash_mem; + + hwaddr flash0_base; + hwaddr flash0_size; + + hwaddr flash1_base; + hwaddr flash1_size; + + flash_mem = pflash_cfi01_get_memory(lvms->flash[0]); + flash0_base = flash_mem->addr; + flash0_size = memory_region_size(flash_mem); + + flash_mem = pflash_cfi01_get_memory(lvms->flash[1]); + flash1_base = flash_mem->addr; + flash1_size = memory_region_size(flash_mem); + + nodename = g_strdup_printf("/flash@%" PRIx64, flash0_base); qemu_fdt_add_subnode(ms->fdt, nodename); qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "cfi-flash"); qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", - 2, flash_base, 2, flash_size); + 2, flash0_base, 2, flash0_size, + 2, flash1_base, 2, flash1_size); qemu_fdt_setprop_cell(ms->fdt, nodename, "bank-width", 4); g_free(nodename); } -static void fdt_add_rtc_node(LoongArchMachineState *lams) +static void fdt_add_rtc_node(LoongArchVirtMachineState *lvms, + uint32_t *pch_pic_phandle) { char *nodename; hwaddr base = VIRT_RTC_REG_BASE; hwaddr size = VIRT_RTC_LEN; - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); nodename = g_strdup_printf("/rtc@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); - qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "loongson,ls7a-rtc"); + qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", + "loongson,ls7a-rtc"); qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", + VIRT_RTC_IRQ - VIRT_GSI_BASE , FDT_IRQ_FLAGS_EDGE_LO_HI); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *pch_pic_phandle); g_free(nodename); } -static void fdt_add_uart_node(LoongArchMachineState *lams) +static void fdt_add_ged_reset(LoongArchVirtMachineState *lvms) +{ + char *name; + uint32_t ged_handle; + MachineState *ms = MACHINE(lvms); + hwaddr base = VIRT_GED_REG_ADDR; + hwaddr size = ACPI_GED_REG_COUNT; + + ged_handle = qemu_fdt_alloc_phandle(ms->fdt); + name = g_strdup_printf("/ged@%" PRIx64, base); 
+ qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon"); + qemu_fdt_setprop_cells(ms->fdt, name, "reg", 0x0, base, 0x0, size); + /* 8 bit registers */ + qemu_fdt_setprop_cell(ms->fdt, name, "reg-shift", 0); + qemu_fdt_setprop_cell(ms->fdt, name, "reg-io-width", 1); + qemu_fdt_setprop_cell(ms->fdt, name, "phandle", ged_handle); + ged_handle = qemu_fdt_get_phandle(ms->fdt, name); + g_free(name); + + name = g_strdup_printf("/reboot"); + qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon-reboot"); + qemu_fdt_setprop_cell(ms->fdt, name, "regmap", ged_handle); + qemu_fdt_setprop_cell(ms->fdt, name, "offset", ACPI_GED_REG_RESET); + qemu_fdt_setprop_cell(ms->fdt, name, "value", ACPI_GED_RESET_VALUE); + g_free(name); + + name = g_strdup_printf("/poweroff"); + qemu_fdt_add_subnode(ms->fdt, name); + qemu_fdt_setprop_string(ms->fdt, name, "compatible", "syscon-poweroff"); + qemu_fdt_setprop_cell(ms->fdt, name, "regmap", ged_handle); + qemu_fdt_setprop_cell(ms->fdt, name, "offset", ACPI_GED_REG_SLEEP_CTL); + qemu_fdt_setprop_cell(ms->fdt, name, "value", ACPI_GED_SLP_EN | + (ACPI_GED_SLP_TYP_S5 << ACPI_GED_SLP_TYP_POS)); + g_free(name); +} + +static void fdt_add_uart_node(LoongArchVirtMachineState *lvms, + uint32_t *pch_pic_phandle, hwaddr base, + int irq, bool chosen) { char *nodename; - hwaddr base = VIRT_UART_BASE; hwaddr size = VIRT_UART_SIZE; - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); nodename = g_strdup_printf("/serial@%" PRIx64, base); qemu_fdt_add_subnode(ms->fdt, nodename); qemu_fdt_setprop_string(ms->fdt, nodename, "compatible", "ns16550a"); qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 0x0, base, 0x0, size); qemu_fdt_setprop_cell(ms->fdt, nodename, "clock-frequency", 100000000); - qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); + if (chosen) + qemu_fdt_setprop_string(ms->fdt, "/chosen", "stdout-path", nodename); + qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupts", irq, FDT_IRQ_FLAGS_LEVEL_HI); + qemu_fdt_setprop_cell(ms->fdt, nodename, "interrupt-parent", + *pch_pic_phandle); g_free(nodename); } -static void create_fdt(LoongArchMachineState *lams) +static void create_fdt(LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); + uint8_t rng_seed[32]; - ms->fdt = create_device_tree(&lams->fdt_size); + ms->fdt = create_device_tree(&lvms->fdt_size); if (!ms->fdt) { error_report("create_device_tree() failed"); exit(1); @@ -155,12 +362,16 @@ static void create_fdt(LoongArchMachineState *lams) qemu_fdt_setprop_cell(ms->fdt, "/", "#address-cells", 0x2); qemu_fdt_setprop_cell(ms->fdt, "/", "#size-cells", 0x2); qemu_fdt_add_subnode(ms->fdt, "/chosen"); + + /* Pass seed to RNG */ + qemu_guest_getrandom_nofail(rng_seed, sizeof(rng_seed)); + qemu_fdt_setprop(ms->fdt, "/chosen", "rng-seed", rng_seed, sizeof(rng_seed)); } -static void fdt_add_cpu_nodes(const LoongArchMachineState *lams) +static void fdt_add_cpu_nodes(const LoongArchVirtMachineState *lvms) { int num; - const MachineState *ms = MACHINE(lams); + const MachineState *ms = MACHINE(lvms); int smp_cpus = ms->smp.cpus; qemu_fdt_add_subnode(ms->fdt, "/cpus"); @@ -214,11 +425,11 @@ static void fdt_add_cpu_nodes(const LoongArchMachineState *lams) } } -static void fdt_add_fw_cfg_node(const LoongArchMachineState *lams) +static void fdt_add_fw_cfg_node(const LoongArchVirtMachineState *lvms) { char *nodename; hwaddr base = VIRT_FWCFG_BASE; - const 
MachineState *ms = MACHINE(lams);
+    const MachineState *ms = MACHINE(lvms);
 
     nodename = g_strdup_printf("/fw_cfg@%" PRIx64, base);
     qemu_fdt_add_subnode(ms->fdt, nodename);
@@ -230,7 +441,62 @@ static void fdt_add_fw_cfg_node(const LoongArchMachineState *lams)
     g_free(nodename);
 }
 
-static void fdt_add_pcie_node(const LoongArchMachineState *lams)
+static void fdt_add_pcie_irq_map_node(const LoongArchVirtMachineState *lvms,
+                                      char *nodename,
+                                      uint32_t *pch_pic_phandle)
+{
+    int pin, dev;
+    uint32_t irq_map_stride = 0;
+    uint32_t full_irq_map[GPEX_NUM_IRQS * GPEX_NUM_IRQS * 10] = {};
+    uint32_t *irq_map = full_irq_map;
+    const MachineState *ms = MACHINE(lvms);
+
+    /*
+     * This code creates a standard swizzle of interrupts such that
+     * each device's first interrupt is based on its PCI_SLOT number
+     * (see pci_swizzle_map_irq_fn()); e.g. INTA# of the device in
+     * slot 1 is routed to irq 16 + ((0 + 1) % GPEX_NUM_IRQS) = 17.
+     *
+     * We only need one entry per interrupt in the table (not one per
+     * possible slot) since the interrupt-map-mask allows the table
+     * to wrap to any number of devices.
+     */
+    for (dev = 0; dev < GPEX_NUM_IRQS; dev++) {
+        int devfn = dev * 0x8;
+
+        for (pin = 0; pin < GPEX_NUM_IRQS; pin++) {
+            int irq_nr = 16 + ((pin + PCI_SLOT(devfn)) % GPEX_NUM_IRQS);
+            int i = 0;
+
+            /* Fill PCI address cells */
+            irq_map[i] = cpu_to_be32(devfn << 8);
+            i += 3;
+
+            /* Fill PCI Interrupt cells */
+            irq_map[i] = cpu_to_be32(pin + 1);
+            i += 1;
+
+            /* Fill interrupt controller phandle and cells */
+            irq_map[i++] = cpu_to_be32(*pch_pic_phandle);
+            irq_map[i++] = cpu_to_be32(irq_nr);
+
+            if (!irq_map_stride) {
+                irq_map_stride = i;
+            }
+            irq_map += irq_map_stride;
+        }
+    }
+
+    qemu_fdt_setprop(ms->fdt, nodename, "interrupt-map", full_irq_map,
+                     GPEX_NUM_IRQS * GPEX_NUM_IRQS *
+                     irq_map_stride * sizeof(uint32_t));
+    qemu_fdt_setprop_cells(ms->fdt, nodename, "interrupt-map-mask",
+                           0x1800, 0, 0, 0x7);
+}
+
+static void fdt_add_pcie_node(const LoongArchVirtMachineState *lvms,
+                              uint32_t *pch_pic_phandle,
+                              uint32_t *pch_msi_phandle)
 {
     char *nodename;
     hwaddr base_mmio = VIRT_PCI_MEM_BASE;
@@ -241,7 +507,7 @@ static void fdt_add_pcie_node(const LoongArchMachineState *lams)
     hwaddr size_pcie = VIRT_PCI_CFG_SIZE;
     hwaddr base = base_pcie;
 
-    const MachineState *ms = MACHINE(lams);
+    const MachineState *ms = MACHINE(lvms);
 
     nodename = g_strdup_printf("/pcie@%" PRIx64, base);
     qemu_fdt_add_subnode(ms->fdt, nodename);
@@ -261,34 +527,11 @@ static void fdt_add_pcie_node(const LoongArchMachineState *lams)
                                  2, base_pio, 2, size_pio,
                                  1, FDT_PCI_RANGE_MMIO, 2, base_mmio,
                                  2, base_mmio, 2, size_mmio);
-    g_free(nodename);
-}
-
-static void fdt_add_irqchip_node(LoongArchMachineState *lams)
-{
-    MachineState *ms = MACHINE(lams);
-    char *nodename;
-    uint32_t irqchip_phandle;
-
-    irqchip_phandle = qemu_fdt_alloc_phandle(ms->fdt);
-    qemu_fdt_setprop_cell(ms->fdt, "/", "interrupt-parent", irqchip_phandle);
+    qemu_fdt_setprop_cells(ms->fdt, nodename, "msi-map",
+                           0, *pch_msi_phandle, 0, 0x10000);
 
-    nodename = g_strdup_printf("/intc@%lx", VIRT_IOAPIC_REG_BASE);
-    qemu_fdt_add_subnode(ms->fdt, nodename);
-    qemu_fdt_setprop_cell(ms->fdt, nodename, "#interrupt-cells", 3);
-    qemu_fdt_setprop(ms->fdt, nodename, "interrupt-controller", NULL, 0);
-    qemu_fdt_setprop_cell(ms->fdt, nodename, "#address-cells", 0x2);
-    qemu_fdt_setprop_cell(ms->fdt, nodename, "#size-cells", 0x2);
-    qemu_fdt_setprop(ms->fdt, nodename, "ranges", NULL, 0);
-
-    qemu_fdt_setprop_string(ms->fdt, nodename, "compatible",
-                            "loongarch,ls7a");
+    fdt_add_pcie_irq_map_node(lvms, nodename, pch_pic_phandle);
 
-    qemu_fdt_setprop_sized_cells(ms->fdt, nodename, "reg",
-                                 2, VIRT_IOAPIC_REG_BASE,
-                                 2, PCH_PIC_ROUTE_ENTRY_OFFSET);
-
-    qemu_fdt_setprop_cell(ms->fdt, nodename, "phandle", irqchip_phandle);
     g_free(nodename);
 }
 
@@ -298,7 +541,8 @@ static void fdt_add_memory_node(MachineState *ms,
     char *nodename = g_strdup_printf("/memory@%" PRIx64, base);
 
     qemu_fdt_add_subnode(ms->fdt, nodename);
-    qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", 2, base, 2, size);
+    qemu_fdt_setprop_cells(ms->fdt, nodename, "reg", base >> 32, base,
+                           size >> 32, size);
     qemu_fdt_setprop_string(ms->fdt, nodename, "device_type", "memory");
 
     if (ms->numa_state && ms->numa_state->num_nodes) {
@@ -308,15 +552,57 @@ static void fdt_add_memory_node(MachineState *ms,
     g_free(nodename);
 }
 
-static void virt_build_smbios(LoongArchMachineState *lams)
+static void fdt_add_memory_nodes(MachineState *ms)
+{
+    hwaddr base, size, ram_size, gap;
+    int i, nb_numa_nodes, nodes;
+    NodeInfo *numa_info;
+
+    ram_size = ms->ram_size;
+    base = VIRT_LOWMEM_BASE;
+    gap = VIRT_LOWMEM_SIZE;
+    nodes = nb_numa_nodes = ms->numa_state->num_nodes;
+    numa_info = ms->numa_state->nodes;
+    if (!nodes) {
+        nodes = 1;
+    }
+
+    for (i = 0; i < nodes; i++) {
+        if (nb_numa_nodes) {
+            size = numa_info[i].node_mem;
+        } else {
+            size = ram_size;
+        }
+
+        /*
+         * The memory of each node is split into two parts:
+         *   lowram:  [base, +gap)
+         *   highram: [VIRT_HIGHMEM_BASE, +(size - gap))
+         */
+        if (size >= gap) {
+            fdt_add_memory_node(ms, base, gap, i);
+            size -= gap;
+            base = VIRT_HIGHMEM_BASE;
+            gap = ram_size - VIRT_LOWMEM_SIZE;
+        }
+
+        if (size) {
+            fdt_add_memory_node(ms, base, size, i);
+            base += size;
+            gap -= size;
+        }
+    }
+}
+
+static void virt_build_smbios(LoongArchVirtMachineState *lvms)
 {
-    MachineState *ms = MACHINE(lams);
-    MachineClass *mc = MACHINE_GET_CLASS(lams);
+    MachineState *ms = MACHINE(lvms);
+    MachineClass *mc = MACHINE_GET_CLASS(lvms);
     uint8_t *smbios_tables, *smbios_anchor;
     size_t smbios_tables_len, smbios_anchor_len;
     const char *product = "QEMU Virtual Machine";
 
-    if (!lams->fw_cfg) {
+    if (!lvms->fw_cfg) {
         return;
     }
 
@@ -327,39 +613,29 @@ static void virt_build_smbios(LoongArchMachineState *lams)
                         &smbios_anchor, &smbios_anchor_len, &error_fatal);
 
     if (smbios_anchor) {
-        fw_cfg_add_file(lams->fw_cfg, "etc/smbios/smbios-tables",
+        fw_cfg_add_file(lvms->fw_cfg, "etc/smbios/smbios-tables",
                         smbios_tables, smbios_tables_len);
-        fw_cfg_add_file(lams->fw_cfg, "etc/smbios/smbios-anchor",
+        fw_cfg_add_file(lvms->fw_cfg, "etc/smbios/smbios-anchor",
                         smbios_anchor, smbios_anchor_len);
     }
 }
 
-static void virt_machine_done(Notifier *notifier, void *data)
+static void virt_done(Notifier *notifier, void *data)
 {
-    LoongArchMachineState *lams = container_of(notifier,
-                                        LoongArchMachineState, machine_done);
-    virt_build_smbios(lams);
-    loongarch_acpi_setup(lams);
+    LoongArchVirtMachineState *lvms = container_of(notifier,
+                                      LoongArchVirtMachineState, machine_done);
+    virt_build_smbios(lvms);
+    loongarch_acpi_setup(lvms);
 }
 
 static void virt_powerdown_req(Notifier *notifier, void *opaque)
 {
-    LoongArchMachineState *s = container_of(notifier,
-                                   LoongArchMachineState, powerdown_notifier);
+    LoongArchVirtMachineState *s;
 
+    s = container_of(notifier, LoongArchVirtMachineState, powerdown_notifier);
     acpi_send_event(s->acpi_ged, ACPI_POWER_DOWN_STATUS);
 }
 
-struct memmap_entry {
-    uint64_t address;
-    uint64_t length;
-    uint32_t type;
-    uint32_t reserved;
-};
-
-static struct memmap_entry *memmap_table;
-static unsigned memmap_entries;
-
 static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type)
 {
     /* Ensure there
are no duplicate entries. */ @@ -376,40 +652,22 @@ static void memmap_add_entry(uint64_t address, uint64_t length, uint32_t type) memmap_entries++; } -static uint64_t cpu_loongarch_virt_to_phys(void *opaque, uint64_t addr) -{ - return addr & MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS); -} - -static int64_t load_kernel_info(const struct loaderparams *loaderparams) -{ - uint64_t kernel_entry, kernel_low, kernel_high; - ssize_t kernel_size; - - kernel_size = load_elf(loaderparams->kernel_filename, NULL, - cpu_loongarch_virt_to_phys, NULL, - &kernel_entry, &kernel_low, - &kernel_high, NULL, 0, - EM_LOONGARCH, 1, 0); - - if (kernel_size < 0) { - error_report("could not load kernel '%s': %s", - loaderparams->kernel_filename, - load_elf_strerror(kernel_size)); - exit(1); - } - return kernel_entry; -} - -static DeviceState *create_acpi_ged(DeviceState *pch_pic, LoongArchMachineState *lams) +static DeviceState *create_acpi_ged(DeviceState *pch_pic, + LoongArchVirtMachineState *lvms) { DeviceState *dev; - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); + MachineClass *mc = MACHINE_GET_CLASS(lvms); uint32_t event = ACPI_GED_PWR_DOWN_EVT; if (ms->ram_slots) { event |= ACPI_GED_MEM_HOTPLUG_EVT; } + + if (mc->has_hotpluggable_cpus) { + event |= ACPI_GED_CPU_HOTPLUG_EVT; + } + dev = qdev_new(TYPE_ACPI_GED); qdev_prop_set_uint32(dev, "ged-event", event); sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal); @@ -421,6 +679,10 @@ static DeviceState *create_acpi_ged(DeviceState *pch_pic, LoongArchMachineState /* ged regs used for reset and power down */ sysbus_mmio_map(SYS_BUS_DEVICE(dev), 2, VIRT_GED_REG_ADDR); + if (mc->has_hotpluggable_cpus) { + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 3, VIRT_GED_CPUHP_ADDR); + } + sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, qdev_get_gpio_in(pch_pic, VIRT_SCI_IRQ - VIRT_GSI_BASE)); return dev; @@ -451,9 +713,12 @@ static DeviceState *create_platform_bus(DeviceState *pch_pic) return dev; } -static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState *lams) +static void virt_devices_init(DeviceState *pch_pic, + LoongArchVirtMachineState *lvms, + uint32_t *pch_pic_phandle, + uint32_t *pch_msi_phandle) { - MachineClass *mc = MACHINE_GET_CLASS(lams); + MachineClass *mc = MACHINE_GET_CLASS(lvms); DeviceState *gpex_dev; SysBusDevice *d; PCIBus *pci_bus; @@ -465,7 +730,7 @@ static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState * d = SYS_BUS_DEVICE(gpex_dev); sysbus_realize_and_unref(d, &error_fatal); pci_bus = PCI_HOST_BRIDGE(gpex_dev)->bus; - lams->pci_bus = pci_bus; + lvms->pci_bus = pci_bus; /* Map only part size_ecam bytes of ECAM space */ ecam_alias = g_new0(MemoryRegion, 1); @@ -497,11 +762,21 @@ static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState * gpex_set_irq_num(GPEX_HOST(gpex_dev), i, 16 + i); } - serial_mm_init(get_system_memory(), VIRT_UART_BASE, 0, - qdev_get_gpio_in(pch_pic, - VIRT_UART_IRQ - VIRT_GSI_BASE), - 115200, serial_hd(0), DEVICE_LITTLE_ENDIAN); - fdt_add_uart_node(lams); + /* Add pcie node */ + fdt_add_pcie_node(lvms, pch_pic_phandle, pch_msi_phandle); + + /* + * Create uart fdt node in reverse order so that they appear + * in the finished device tree lowest address first + */ + for (i = VIRT_UART_COUNT; i --> 0;) { + hwaddr base = VIRT_UART_BASE + i * VIRT_UART_SIZE; + int irq = VIRT_UART_IRQ + i - VIRT_GSI_BASE; + serial_mm_init(get_system_memory(), base, 0, + qdev_get_gpio_in(pch_pic, irq), + 115200, serial_hd(i), DEVICE_LITTLE_ENDIAN); + 
fdt_add_uart_node(lvms, pch_pic_phandle, base, irq, i == 0); + } /* Network init */ for (i = 0; i < nb_nics; i++) { @@ -516,17 +791,18 @@ static void loongarch_devices_init(DeviceState *pch_pic, LoongArchMachineState * sysbus_create_simple("ls7a_rtc", VIRT_RTC_REG_BASE, qdev_get_gpio_in(pch_pic, VIRT_RTC_IRQ - VIRT_GSI_BASE)); - fdt_add_rtc_node(lams); + fdt_add_rtc_node(lvms, pch_pic_phandle); + fdt_add_ged_reset(lvms); /* acpi ged */ - lams->acpi_ged = create_acpi_ged(pch_pic, lams); + lvms->acpi_ged = create_acpi_ged(pch_pic, lvms); /* platform bus */ - lams->platform_bus_dev = create_platform_bus(pch_pic); + lvms->platform_bus_dev = create_platform_bus(pch_pic); } -static void loongarch_irq_init(LoongArchMachineState *lams) +static void virt_irq_init(LoongArchVirtMachineState *lvms) { - MachineState *ms = MACHINE(lams); + MachineState *ms = MACHINE(lvms); DeviceState *pch_pic, *pch_msi, *cpudev; DeviceState *ipi, *extioi; SysBusDevice *d; @@ -534,9 +810,7 @@ static void loongarch_irq_init(LoongArchMachineState *lams) CPULoongArchState *env; CPUState *cpu_state; int cpu, pin, i, start, num; - - extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); - sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + uint32_t cpuintc_phandle, eiointc_phandle, pch_pic_phandle, pch_msi_phandle; /* * The connection of interrupts: @@ -559,93 +833,160 @@ static void loongarch_irq_init(LoongArchMachineState *lams) * | UARTs | | Devices | | Devices | * +--------+ +---------+ +---------+ */ - for (cpu = 0; cpu < ms->smp.cpus; cpu++) { - cpu_state = qemu_get_cpu(cpu); - cpudev = DEVICE(cpu_state); - lacpu = LOONGARCH_CPU(cpu_state); - env = &(lacpu->env); + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + ipi = qdev_new(TYPE_KVM_LOONGARCH_IPI); + qdev_prop_set_int32(ipi, "num-cpu", ms->smp.max_cpus); + sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); + } else { ipi = qdev_new(TYPE_LOONGARCH_IPI); + qdev_prop_set_uint32(ipi, "num-cpu", ms->smp.max_cpus); sysbus_realize_and_unref(SYS_BUS_DEVICE(ipi), &error_fatal); - /* connect ipi irq to cpu irq */ - qdev_connect_gpio_out(ipi, 0, qdev_get_gpio_in(cpudev, IRQ_IPI)); /* IPI iocsr memory region */ - memory_region_add_subregion(&env->system_iocsr, SMP_IPI_MAILBOX, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), - 0)); - memory_region_add_subregion(&env->system_iocsr, MAIL_SEND_ADDR, - sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), - 1)); - /* - * extioi iocsr memory region - * only one extioi is added on loongarch virt machine - * external device interrupt can only be routed to cpu 0-3 - */ - if (cpu < EXTIOI_CPUS) - memory_region_add_subregion(&env->system_iocsr, APIC_BASE, - sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), - cpu)); + memory_region_add_subregion(&lvms->system_iocsr, SMP_IPI_MAILBOX, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 0)); + memory_region_add_subregion(&lvms->system_iocsr, MAIL_SEND_ADDR, + sysbus_mmio_get_region(SYS_BUS_DEVICE(ipi), 1)); + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + cpu_state = qemu_get_cpu(cpu); + cpudev = DEVICE(cpu_state); + + /* connect ipi irq to cpu irq */ + qdev_connect_gpio_out(ipi, cpu, qdev_get_gpio_in(cpudev, IRQ_IPI)); + } + } + + /* Add cpu interrupt-controller */ + fdt_add_cpuic_node(lvms, &cpuintc_phandle); + + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + cpu_state = qemu_get_cpu(cpu); + cpudev = DEVICE(cpu_state); + lacpu = LOONGARCH_CPU(cpu_state); + env = &(lacpu->env); + env->address_space_iocsr = &lvms->as_iocsr; env->ipistate = ipi; } - /* - * connect ext irq to the cpu irq - * cpu_pin[9:2] 
<= intc_pin[7:0] - */ - for (cpu = 0; cpu < MIN(ms->smp.cpus, EXTIOI_CPUS); cpu++) { - cpudev = DEVICE(qemu_get_cpu(cpu)); - for (pin = 0; pin < LS3A_INTC_IP; pin++) { - qdev_connect_gpio_out(extioi, (cpu * 8 + pin), - qdev_get_gpio_in(cpudev, pin + 2)); + lvms->ipi = ipi; + + /* Create EXTIOI device */ + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + extioi = qdev_new(TYPE_KVM_LOONGARCH_EXTIOI); + qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.max_cpus); + qdev_prop_set_bit(extioi, "has-virtualization-extension", true); + sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + } else { + extioi = qdev_new(TYPE_LOONGARCH_EXTIOI); + qdev_prop_set_uint32(extioi, "num-cpu", ms->smp.max_cpus); + if (virt_is_veiointc_enabled(lvms)) { + qdev_prop_set_bit(extioi, "has-virtualization-extension", true); } + sysbus_realize_and_unref(SYS_BUS_DEVICE(extioi), &error_fatal); + memory_region_add_subregion(&lvms->system_iocsr, APIC_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 0)); + if (virt_is_veiointc_enabled(lvms)) { + memory_region_add_subregion(&lvms->system_iocsr, EXTIOI_VIRT_BASE, + sysbus_mmio_get_region(SYS_BUS_DEVICE(extioi), 1)); + } + /* + * connect ext irq to the cpu irq + * cpu_pin[9:2] <= intc_pin[7:0] + */ + for (cpu = 0; cpu < ms->smp.cpus; cpu++) { + cpudev = DEVICE(qemu_get_cpu(cpu)); + for (pin = 0; pin < LS3A_INTC_IP; pin++) { + qdev_connect_gpio_out(extioi, (cpu * 8 + pin), + qdev_get_gpio_in(cpudev, pin + 2)); + } + } } - pch_pic = qdev_new(TYPE_LOONGARCH_PCH_PIC); - num = VIRT_PCH_PIC_IRQ_NUM; - qdev_prop_set_uint32(pch_pic, "pch_pic_irq_num", num); - d = SYS_BUS_DEVICE(pch_pic); - sysbus_realize_and_unref(d, &error_fatal); - memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE, - sysbus_mmio_get_region(d, 0)); - memory_region_add_subregion(get_system_memory(), + lvms->extioi = extioi; + + /* Add Extend I/O Interrupt Controller node */ + fdt_add_eiointc_node(lvms, &cpuintc_phandle, &eiointc_phandle); + + /* Add PCH PIC node */ + fdt_add_pch_pic_node(lvms, &eiointc_phandle, &pch_pic_phandle); + + /* Add PCH MSI node */ + fdt_add_pch_msi_node(lvms, &eiointc_phandle, &pch_msi_phandle); + + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + pch_pic = qdev_new(TYPE_KVM_LOONGARCH_PCH_PIC); + sysbus_realize_and_unref(SYS_BUS_DEVICE(pch_pic), &error_fatal); + } else { + pch_pic = qdev_new(TYPE_LOONGARCH_PCH_PIC); + num = VIRT_PCH_PIC_IRQ_NUM; + qdev_prop_set_uint32(pch_pic, "pch_pic_irq_num", num); + d = SYS_BUS_DEVICE(pch_pic); + sysbus_realize_and_unref(d, &error_fatal); + memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE, + sysbus_mmio_get_region(d, 0)); + memory_region_add_subregion(get_system_memory(), VIRT_IOAPIC_REG_BASE + PCH_PIC_ROUTE_ENTRY_OFFSET, sysbus_mmio_get_region(d, 1)); - memory_region_add_subregion(get_system_memory(), - VIRT_IOAPIC_REG_BASE + PCH_PIC_INT_STATUS_LO, - sysbus_mmio_get_region(d, 2)); - - /* Connect pch_pic irqs to extioi */ - for (i = 0; i < num; i++) { - qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); + memory_region_add_subregion(get_system_memory(), + VIRT_IOAPIC_REG_BASE + PCH_PIC_INT_STATUS_LO, + sysbus_mmio_get_region(d, 2)); + /* Connect pch_pic irqs to extioi */ + for (i = 0; i < num; i++) { + qdev_connect_gpio_out(DEVICE(d), i, qdev_get_gpio_in(extioi, i)); + } } pch_msi = qdev_new(TYPE_LOONGARCH_PCH_MSI); + num = VIRT_PCH_PIC_IRQ_NUM; start = num; num = EXTIOI_IRQS - start; qdev_prop_set_uint32(pch_msi, "msi_irq_base", start); qdev_prop_set_uint32(pch_msi, 
"msi_irq_num", num); d = SYS_BUS_DEVICE(pch_msi); sysbus_realize_and_unref(d, &error_fatal); - sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); - for (i = 0; i < num; i++) { + + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) { /* Connect pch_msi irqs to extioi */ - qdev_connect_gpio_out(DEVICE(d), i, - qdev_get_gpio_in(extioi, i + start)); + for (i = 0; i < num; i++) { + qdev_connect_gpio_out(DEVICE(d), i, + qdev_get_gpio_in(extioi, i + start)); + } } - loongarch_devices_init(pch_pic, lams); + sysbus_mmio_map(d, 0, VIRT_PCH_MSI_ADDR_LOW); + virt_devices_init(pch_pic, lvms, &pch_pic_phandle, &pch_msi_phandle); } -static void loongarch_firmware_init(LoongArchMachineState *lams) +static void virt_firmware_init(LoongArchVirtMachineState *lvms) { - char *filename = MACHINE(lams)->firmware; + char *filename = MACHINE(lvms)->firmware; char *bios_name = NULL; - int bios_size; + int bios_size, i; + BlockBackend *pflash_blk0; + MemoryRegion *mr; - lams->bios_loaded = false; + lvms->bios_loaded = false; - virt_flash_map(lams, get_system_memory()); + /* Map legacy -drive if=pflash to machine properties */ + for (i = 0; i < ARRAY_SIZE(lvms->flash); i++) { + pflash_cfi01_legacy_drive(lvms->flash[i], + drive_get(IF_PFLASH, 0, i)); + } + + virt_flash_map(lvms, get_system_memory()); + + pflash_blk0 = pflash_cfi01_get_blk(lvms->flash[0]); + + if (pflash_blk0) { + if (filename) { + error_report("cannot use both '-bios' and '-drive if=pflash'" + "options at once"); + exit(1); + } + lvms->bios_loaded = true; + return; + } if (filename) { bios_name = qemu_find_file(QEMU_FILE_TYPE_BIOS, filename); @@ -654,166 +995,251 @@ static void loongarch_firmware_init(LoongArchMachineState *lams) exit(1); } - bios_size = load_image_targphys(bios_name, VIRT_BIOS_BASE, VIRT_BIOS_SIZE); + mr = sysbus_mmio_get_region(SYS_BUS_DEVICE(lvms->flash[0]), 0); + bios_size = load_image_mr(bios_name, mr); if (bios_size < 0) { error_report("Could not load ROM image '%s'", bios_name); exit(1); } - g_free(bios_name); + lvms->bios_loaded = true; + } +} + + +static MemTxResult virt_iocsr_misc_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size, MemTxAttrs attrs) +{ + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(opaque); + uint64_t features; - memory_region_init_ram(&lams->bios, NULL, "loongarch.bios", - VIRT_BIOS_SIZE, &error_fatal); - memory_region_set_readonly(&lams->bios, true); - memory_region_add_subregion(get_system_memory(), VIRT_BIOS_BASE, &lams->bios); - lams->bios_loaded = true; + switch (addr) { + case MISC_FUNC_REG: + if (!virt_is_veiointc_enabled(lvms)) { + return MEMTX_OK; + } + + features = address_space_ldl(&lvms->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + attrs, NULL); + if (val & BIT_ULL(IOCSRM_EXTIOI_EN)) { + features |= BIT(EXTIOI_ENABLE); + } + if (val & BIT_ULL(IOCSRM_EXTIOI_INT_ENCODE)) { + features |= BIT(EXTIOI_ENABLE_INT_ENCODE); + } + + address_space_stl(&lvms->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + features, attrs, NULL); } + return MEMTX_OK; } -static void reset_load_elf(void *opaque) +static MemTxResult virt_iocsr_misc_read(void *opaque, hwaddr addr, + uint64_t *data, + unsigned size, MemTxAttrs attrs) { - LoongArchCPU *cpu = opaque; - CPULoongArchState *env = &cpu->env; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(opaque); + uint64_t ret = 0; + int features; + + switch (addr) { + case VERSION_REG: + ret = 0x11ULL; + break; + case FEATURE_REG: + ret = BIT(IOCSRF_MSI) | BIT(IOCSRF_EXTIOI) | BIT(IOCSRF_CSRIPI); + if (kvm_enabled()) { + ret |= 
BIT(IOCSRF_VM); + } + break; + case VENDOR_REG: + ret = 0x6e6f73676e6f6f4cULL; /* "Loongson" */ + break; + case CPUNAME_REG: + ret = 0x303030354133ULL; /* "3A5000" */ + break; + case MISC_FUNC_REG: + if (!virt_is_veiointc_enabled(lvms)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_EN); + break; + } + + features = address_space_ldl(&lvms->as_iocsr, + EXTIOI_VIRT_BASE + EXTIOI_VIRT_CONFIG, + attrs, NULL); + if (features & BIT(EXTIOI_ENABLE)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_EN); + } - cpu_reset(CPU(cpu)); - if (env->load_elf) { - cpu_set_pc(CPU(cpu), env->elf_address); + if (features & BIT(EXTIOI_ENABLE_INT_ENCODE)) { + ret |= BIT_ULL(IOCSRM_EXTIOI_INT_ENCODE); + } + break; + default: + g_assert_not_reached(); } + + *data = ret; + return MEMTX_OK; } -static void fw_cfg_add_kernel_info(const struct loaderparams *loaderparams, - FWCfgState *fw_cfg) +static const MemoryRegionOps virt_iocsr_misc_ops = { + .read_with_attrs = virt_iocsr_misc_read, + .write_with_attrs = virt_iocsr_misc_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 4, + .max_access_size = 8, + }, + .impl = { + .min_access_size = 8, + .max_access_size = 8, + }, +}; + +static void fw_cfg_add_memory(MachineState *ms) { - /* - * Expose the kernel, the command line, and the initrd in fw_cfg. - * We don't process them here at all, it's all left to the - * firmware. - */ - load_image_to_fw_cfg(fw_cfg, - FW_CFG_KERNEL_SIZE, FW_CFG_KERNEL_DATA, - loaderparams->kernel_filename, - false); - - if (loaderparams->initrd_filename) { - load_image_to_fw_cfg(fw_cfg, - FW_CFG_INITRD_SIZE, FW_CFG_INITRD_DATA, - loaderparams->initrd_filename, false); + hwaddr base, size, ram_size, gap; + int nb_numa_nodes, nodes; + NodeInfo *numa_info; + + ram_size = ms->ram_size; + base = VIRT_LOWMEM_BASE; + gap = VIRT_LOWMEM_SIZE; + nodes = nb_numa_nodes = ms->numa_state->num_nodes; + numa_info = ms->numa_state->nodes; + if (!nodes) { + nodes = 1; } - if (loaderparams->kernel_cmdline) { - fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, - strlen(loaderparams->kernel_cmdline) + 1); - fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, - loaderparams->kernel_cmdline); + /* add fw_cfg memory map of node0 */ + if (nb_numa_nodes) { + size = numa_info[0].node_mem; + } else { + size = ram_size; } -} -static void loongarch_firmware_boot(LoongArchMachineState *lams, - const struct loaderparams *loaderparams) -{ - fw_cfg_add_kernel_info(loaderparams, lams->fw_cfg); -} + if (size >= gap) { + memmap_add_entry(base, gap, 1); + size -= gap; + base = VIRT_HIGHMEM_BASE; + } -static void loongarch_direct_kernel_boot(LoongArchMachineState *lams, - const struct loaderparams *loaderparams) -{ - MachineState *machine = MACHINE(lams); - int64_t kernel_addr = 0; - LoongArchCPU *lacpu; - int i; + if (size) { + memmap_add_entry(base, size, 1); + base += size; + } - kernel_addr = load_kernel_info(loaderparams); - if (!machine->firmware) { - for (i = 0; i < machine->smp.cpus; i++) { - lacpu = LOONGARCH_CPU(qemu_get_cpu(i)); - lacpu->env.load_elf = true; - lacpu->env.elf_address = kernel_addr; - } + if (nodes < 2) { + return; } + + /* add fw_cfg memory map of other nodes */ + if (numa_info[0].node_mem < gap && ram_size > gap) { + /* + * memory map for the maining nodes splited into two part + * lowram: [base, +(gap - numa_info[0].node_mem)) + * highram: [VIRT_HIGHMEM_BASE, +(ram_size - gap)) + */ + memmap_add_entry(base, gap - numa_info[0].node_mem, 1); + size = ram_size - gap; + base = VIRT_HIGHMEM_BASE; + } else { + size = ram_size - numa_info[0].node_mem; + } + + if (size) + 
memmap_add_entry(base, size, 1); } -static void loongarch_init(MachineState *machine) +static void virt_init(MachineState *machine) { LoongArchCPU *lacpu; const char *cpu_model = machine->cpu_type; - ram_addr_t offset = 0; - ram_addr_t ram_size = machine->ram_size; - uint64_t highram_size = 0, phyAddr = 0; MemoryRegion *address_space_mem = get_system_memory(); - LoongArchMachineState *lams = LOONGARCH_MACHINE(machine); - int nb_numa_nodes = machine->numa_state->num_nodes; - NodeInfo *numa_info = machine->numa_state->nodes; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(machine); int i; - hwaddr fdt_base; - const CPUArchIdList *possible_cpus; + hwaddr base, size, ram_size = machine->ram_size; MachineClass *mc = MACHINE_GET_CLASS(machine); - CPUState *cpu; - char *ramName = NULL; - struct loaderparams loaderparams = { }; if (!cpu_model) { cpu_model = LOONGARCH_CPU_TYPE_NAME("la464"); } - if (ram_size < 1 * GiB) { - error_report("ram_size must be greater than 1G."); - exit(1); - } - create_fdt(lams); + create_fdt(lvms); + + /* Create IOCSR space */ + memory_region_init_io(&lvms->system_iocsr, OBJECT(machine), NULL, + machine, "iocsr", UINT64_MAX); + address_space_init(&lvms->as_iocsr, &lvms->system_iocsr, "IOCSR"); + memory_region_init_io(&lvms->iocsr_mem, OBJECT(machine), + &virt_iocsr_misc_ops, + machine, "iocsr_misc", 0x428); + memory_region_add_subregion(&lvms->system_iocsr, 0, &lvms->iocsr_mem); + /* Init CPUs */ + mc->possible_cpu_arch_ids(machine); + for (i = 0; i < machine->smp.cpus; i++) { + Object *cpuobj; + cpuobj = object_new(machine->cpu_type); + lacpu = LOONGARCH_CPU(cpuobj); - possible_cpus = mc->possible_cpu_arch_ids(machine); - for (i = 0; i < possible_cpus->len; i++) { - cpu = cpu_create(machine->cpu_type); - cpu->cpu_index = i; - machine->possible_cpus->cpus[i].cpu = OBJECT(cpu); - lacpu = LOONGARCH_CPU(cpu); lacpu->phy_id = machine->possible_cpus->cpus[i].arch_id; + object_property_set_int(cpuobj, "socket-id", + machine->possible_cpus->cpus[i].props.socket_id, + NULL); + object_property_set_int(cpuobj, "core-id", + machine->possible_cpus->cpus[i].props.core_id, + NULL); + object_property_set_int(cpuobj, "thread-id", + machine->possible_cpus->cpus[i].props.thread_id, + NULL); + /* + * The CPU in place at the time of machine startup will also enter + * the CPU hot-plug process when it is created, but at this time, + * the GED device has not been created, resulting in exit in the CPU + * hot-plug process, which can avoid the incumbent CPU repeatedly + * applying for resources. + * + * The interrupt resource of the in-place CPU will be requested at + * the current function call loongarch_irq_init(). + * + * The interrupt resource of the subsequently inserted CPU will be + * requested in the CPU hot-plug process. 
+ */ + qdev_realize(DEVICE(cpuobj), NULL, &error_fatal); + object_unref(cpuobj); } - fdt_add_cpu_nodes(lams); + + fdt_add_cpu_nodes(lvms); + fdt_add_memory_nodes(machine); + fw_cfg_add_memory(machine); /* Node0 memory */ - memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1); - fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0); - memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram", - machine->ram, offset, VIRT_LOWMEM_SIZE); - memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem); - - offset += VIRT_LOWMEM_SIZE; - if (nb_numa_nodes > 0) { - assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE); - highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE; - } else { - highram_size = ram_size - VIRT_LOWMEM_SIZE; + size = ram_size; + base = VIRT_LOWMEM_BASE; + if (size > VIRT_LOWMEM_SIZE) { + size = VIRT_LOWMEM_SIZE; } - phyAddr = VIRT_HIGHMEM_BASE; - memmap_add_entry(phyAddr, highram_size, 1); - fdt_add_memory_node(machine, phyAddr, highram_size, 0); - memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram", - machine->ram, offset, highram_size); - memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem); - - /* Node1 - Nodemax memory */ - offset += highram_size; - phyAddr += highram_size; - - for (i = 1; i < nb_numa_nodes; i++) { - MemoryRegion *nodemem = g_new(MemoryRegion, 1); - ramName = g_strdup_printf("loongarch.node%d.ram", i); - memory_region_init_alias(nodemem, NULL, ramName, machine->ram, - offset, numa_info[i].node_mem); - memory_region_add_subregion(address_space_mem, phyAddr, nodemem); - memmap_add_entry(phyAddr, numa_info[i].node_mem, 1); - fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i); - offset += numa_info[i].node_mem; - phyAddr += numa_info[i].node_mem; + + memory_region_init_alias(&lvms->lowmem, NULL, "loongarch.lowram", + machine->ram, base, size); + memory_region_add_subregion(address_space_mem, base, &lvms->lowmem); + base += size; + if (ram_size - size) { + base = VIRT_HIGHMEM_BASE; + memory_region_init_alias(&lvms->highmem, NULL, "loongarch.highram", + machine->ram, VIRT_LOWMEM_BASE + size, ram_size - size); + memory_region_add_subregion(address_space_mem, base, &lvms->highmem); + base += ram_size - size; } /* initialize device memory address space */ if (machine->ram_size < machine->maxram_size) { ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; - hwaddr device_mem_base; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -827,55 +1253,35 @@ static void loongarch_init(MachineState *machine) "%d bytes", TARGET_PAGE_SIZE); exit(EXIT_FAILURE); } - /* device memory base is the top of high memory address. */ - device_mem_base = ROUND_UP(VIRT_HIGHMEM_BASE + highram_size, 1 * GiB); - machine_memory_devices_init(machine, device_mem_base, device_mem_size); + machine_memory_devices_init(machine, base, device_mem_size); } /* load the BIOS image. 
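 * The firmware, if any, is mapped by virt_firmware_init() below; with a
 * direct kernel boot and no firmware, loongarch_load_kernel() at the end
 * of this function loads the kernel instead.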
*/ - loongarch_firmware_init(lams); + virt_firmware_init(lvms); /* fw_cfg init */ - lams->fw_cfg = loongarch_fw_cfg_init(ram_size, machine); - rom_set_fw(lams->fw_cfg); - if (lams->fw_cfg != NULL) { - fw_cfg_add_file(lams->fw_cfg, "etc/memmap", + lvms->fw_cfg = virt_fw_cfg_init(ram_size, machine); + rom_set_fw(lvms->fw_cfg); + if (lvms->fw_cfg != NULL) { + fw_cfg_add_file(lvms->fw_cfg, "etc/memmap", memmap_table, sizeof(struct memmap_entry) * (memmap_entries)); } - fdt_add_fw_cfg_node(lams); - loaderparams.ram_size = ram_size; - loaderparams.kernel_filename = machine->kernel_filename; - loaderparams.kernel_cmdline = machine->kernel_cmdline; - loaderparams.initrd_filename = machine->initrd_filename; - /* load the kernel. */ - if (loaderparams.kernel_filename) { - if (lams->bios_loaded) { - loongarch_firmware_boot(lams, &loaderparams); - } else { - loongarch_direct_kernel_boot(lams, &loaderparams); - } - } - fdt_add_flash_node(lams); - /* register reset function */ - for (i = 0; i < machine->smp.cpus; i++) { - lacpu = LOONGARCH_CPU(qemu_get_cpu(i)); - qemu_register_reset(reset_load_elf, lacpu); - } + fdt_add_fw_cfg_node(lvms); + fdt_add_flash_node(lvms); + /* Initialize the IO interrupt subsystem */ - loongarch_irq_init(lams); - fdt_add_irqchip_node(lams); - platform_bus_add_all_fdt_nodes(machine->fdt, "/intc", + virt_irq_init(lvms); + platform_bus_add_all_fdt_nodes(machine->fdt, "/platic", VIRT_PLATFORM_BUS_BASEADDRESS, VIRT_PLATFORM_BUS_SIZE, VIRT_PLATFORM_BUS_IRQ); - lams->machine_done.notify = virt_machine_done; - qemu_add_machine_init_done_notifier(&lams->machine_done); + lvms->machine_done.notify = virt_done; + qemu_add_machine_init_done_notifier(&lvms->machine_done); /* connect powerdown request */ - lams->powerdown_notifier.notify = virt_powerdown_req; - qemu_register_powerdown_notifier(&lams->powerdown_notifier); + lvms->powerdown_notifier.notify = virt_powerdown_req; + qemu_register_powerdown_notifier(&lvms->powerdown_notifier); - fdt_add_pcie_node(lams); /* * Since lowmem region starts from 0 and Linux kernel legacy start address * at 2 MiB, FDT base address is located at 1 MiB to avoid NULL pointer @@ -883,44 +1289,241 @@ static void loongarch_init(MachineState *machine) * Put the FDT into the memory map as a ROM image: this will ensure * the FDT is copied again upon reset, even if addr points into RAM. 
*/ - fdt_base = 1 * MiB; - qemu_fdt_dumpdtb(machine->fdt, lams->fdt_size); - rom_add_blob_fixed("fdt", machine->fdt, lams->fdt_size, fdt_base); + qemu_fdt_dumpdtb(machine->fdt, lvms->fdt_size); + rom_add_blob_fixed_as("fdt", machine->fdt, lvms->fdt_size, FDT_BASE, + &address_space_memory); + qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds, + rom_ptr_for_as(&address_space_memory, FDT_BASE, lvms->fdt_size)); + + lvms->bootinfo.ram_size = ram_size; + loongarch_load_kernel(machine, &lvms->bootinfo); } -bool loongarch_is_acpi_enabled(LoongArchMachineState *lams) +static void virt_get_acpi(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) { - if (lams->acpi == ON_OFF_AUTO_OFF) { - return false; - } - return true; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); + OnOffAuto acpi = lvms->acpi; + + visit_type_OnOffAuto(v, name, &acpi, errp); } -static void loongarch_get_acpi(Object *obj, Visitor *v, const char *name, +static void virt_set_acpi(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); - OnOffAuto acpi = lams->acpi; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); - visit_type_OnOffAuto(v, name, &acpi, errp); + visit_type_OnOffAuto(v, name, &lvms->acpi, errp); } -static void loongarch_set_acpi(Object *obj, Visitor *v, const char *name, - void *opaque, Error **errp) +static void virt_initfn(Object *obj) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(obj); - visit_type_OnOffAuto(v, name, &lams->acpi, errp); + if (tcg_enabled()) { + lvms->veiointc = ON_OFF_AUTO_OFF; + } + lvms->acpi = ON_OFF_AUTO_AUTO; + lvms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); + lvms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); + virt_flash_create(lvms); } -static void loongarch_machine_initfn(Object *obj) +static int virt_get_arch_id_from_topo(MachineState *ms, LoongArchCPUTopo *topo) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(obj); + int arch_id, sock_vcpu_num, core_vcpu_num; + + /* + * calculate total logical cpus across socket/core/thread. + * For more information on how to calculate the arch_id, + * you can refer to the CPU Topology chapter of the + * docs/system/loongarch/virt.rst document. 
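+ *
+ * In short, with S sockets, C cores per socket and T threads per core:
+ *
+ *   arch_id = socket_id * (C * T) + core_id * T + thread_id
+ *
+ * e.g. with -smp sockets=2,cores=2,threads=2 the vCPU at
+ * socket 1 / core 0 / thread 1 gets arch_id 1 * 4 + 0 * 2 + 1 = 5;
+ * virt_get_cpu_topo_from_index() below inverts the mapping:
+ * 5 / 4 = 1, (5 / 2) % 2 = 0, 5 % 2 = 1.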
+ */ + sock_vcpu_num = topo->socket_id * (ms->smp.threads * ms->smp.cores); + core_vcpu_num = topo->core_id * ms->smp.threads; + + /* get the vcpu-id (logical cpu index) for this vcpu from its topology */ + arch_id = (sock_vcpu_num + core_vcpu_num) + topo->thread_id; + + assert(arch_id >= 0 && arch_id < ms->possible_cpus->len); - lams->acpi = ON_OFF_AUTO_AUTO; - lams->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); - lams->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); - virt_flash_create(lams); + return arch_id; +} + +/* find cpu slot in machine->possible_cpus by arch_id */ +static CPUArchId *virt_find_cpu_slot(MachineState *ms, int arch_id, int *index) +{ + int n; + for (n = 0; n < ms->possible_cpus->len; n++) { + if (ms->possible_cpus->cpus[n].arch_id == arch_id) { + if (index) { + *index = n; + } + return &ms->possible_cpus->cpus[n]; + } + } + + return NULL; +} + +static void virt_cpu_pre_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + MachineState *ms = MACHINE(OBJECT(hotplug_dev)); + MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(dev); + CPUArchId *cpu_slot; + Error *local_err = NULL; + LoongArchCPUTopo topo; + int arch_id, index; + + if (dev->hotplugged && !mc->has_hotpluggable_cpus) { + error_setg(&local_err, "CPU hotplug not supported for this machine"); + goto out; + } + + /* sanity check the cpu */ + if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { + error_setg(&local_err, "Invalid CPU type, expected cpu type: '%s'", + ms->cpu_type); + goto out; + } + + if ((cpu->thread_id < 0) || (cpu->thread_id >= ms->smp.threads)) { + error_setg(&local_err, + "Invalid thread-id %u specified, must be in range 0:%u", + cpu->thread_id, ms->smp.threads - 1); + goto out; + } + + if ((cpu->core_id < 0) || (cpu->core_id >= ms->smp.cores)) { + error_setg(&local_err, + "Invalid core-id %u specified, must be in range 0:%u", + cpu->core_id, ms->smp.cores - 1); + goto out; + } + + if ((cpu->socket_id < 0) || (cpu->socket_id >= ms->smp.sockets)) { + error_setg(&local_err, + "Invalid socket-id %u specified, must be in range 0:%u", + cpu->socket_id, ms->smp.sockets - 1); + goto out; + } + + topo.socket_id = cpu->socket_id; + topo.core_id = cpu->core_id; + topo.thread_id = cpu->thread_id; + arch_id = virt_get_arch_id_from_topo(ms, &topo); + cpu_slot = virt_find_cpu_slot(ms, arch_id, &index); + if (CPU(cpu_slot->cpu)) { + error_setg(&local_err, + "cpu(id%d=%d:%d:%d) with arch-id %" PRIu64 " already exists", + cs->cpu_index, cpu->socket_id, cpu->core_id, + cpu->thread_id, cpu_slot->arch_id); + goto out; + } + cpu->phy_id = arch_id; + /* + * Update cpu_index so that it can be used directly as an index into + * the possible_cpus array (see virt_cpu_index_to_props()). + */ + cs->cpu_index = index; + numa_cpu_pre_plug(cpu_slot, dev, &local_err); + return; + +out: + error_propagate(errp, local_err); +} + +static void virt_cpu_unplug_request(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + Error *local_err = NULL; + HotplugHandlerClass *hhc; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(dev); + + if (!lvms->acpi_ged) { + error_setg(&local_err, "CPU hot unplug not supported without ACPI"); + error_propagate(errp, local_err); + return; + } + + if (cs->cpu_index == 0) { + error_setg(&local_err, + "hot-unplug of boot cpu(id%d=%d:%d:%d) not supported", + cs->cpu_index, cpu->socket_id, + cpu->core_id, cpu->thread_id); +
error_propagate(errp, local_err); + return; + } + + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->unplug_request(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); +} + +static void virt_cpu_unplug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *cpu_slot; + HotplugHandlerClass *hhc; + Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->unplug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + cpu_slot = virt_find_cpu_slot(MACHINE(lvms), cpu->phy_id, NULL); + cpu_slot->cpu = NULL; + return; +} + +static void virt_cpu_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + CPUArchId *cpu_slot; + HotplugHandlerClass *hhc; + Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(dev); + CPUState *cs = CPU(cpu); + CPULoongArchState *env; + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + int pin; + + if (lvms->acpi_ged) { + env = &(cpu->env); + env->address_space_iocsr = &lvms->as_iocsr; + + qemu_register_reset(reset_load_elf, LOONGARCH_CPU(qemu_get_cpu(cs->cpu_index))); + env->ipistate = lvms->ipi; + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) { + /* connect the ipi irq to the cpu irq; the logical cpu index is used here */ + qdev_connect_gpio_out(lvms->ipi, cs->cpu_index, + qdev_get_gpio_in(dev, IRQ_IPI)); + + for (pin = 0; pin < LS3A_INTC_IP; pin++) { + qdev_connect_gpio_out(lvms->extioi, (cs->cpu_index * 8 + pin), + qdev_get_gpio_in(dev, pin + 2)); + } + } + hhc = HOTPLUG_HANDLER_GET_CLASS(lvms->acpi_ged); + hhc->plug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + + cpu_slot = virt_find_cpu_slot(MACHINE(lvms), cpu->phy_id, NULL); + cpu_slot->cpu = OBJECT(dev); + return; } static bool memhp_type_supported(DeviceState *dev) @@ -936,92 +1539,112 @@ static void virt_mem_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), NULL, errp); } -static void virt_machine_device_pre_plug(HotplugHandler *hotplug_dev, +static void virt_device_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { virt_mem_pre_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_pre_plug(hotplug_dev, dev, errp); } } static void virt_mem_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); /* the ACPI GED device always exists */ - hotplug_handler_unplug_request(HOTPLUG_HANDLER(lams->acpi_ged), dev, + hotplug_handler_unplug_request(HOTPLUG_HANDLER(lvms->acpi_ged), dev, errp); } -static void virt_machine_device_unplug_request(HotplugHandler *hotplug_dev, +static void virt_device_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { virt_mem_unplug_request(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_unplug_request(hotplug_dev, dev, errp); } } static void virt_mem_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms =
LOONGARCH_VIRT_MACHINE(hotplug_dev); - hotplug_handler_unplug(HOTPLUG_HANDLER(lams->acpi_ged), dev, errp); - pc_dimm_unplug(PC_DIMM(dev), MACHINE(lams)); + hotplug_handler_unplug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, errp); + pc_dimm_unplug(PC_DIMM(dev), MACHINE(lvms)); qdev_unrealize(dev); } -static void virt_machine_device_unplug(HotplugHandler *hotplug_dev, +static void virt_device_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { if (memhp_type_supported(dev)) { virt_mem_unplug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_unplug(hotplug_dev, dev, errp); } } static void virt_mem_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); - pc_dimm_plug(PC_DIMM(dev), MACHINE(lams)); - hotplug_handler_plug(HOTPLUG_HANDLER(lams->acpi_ged), + pc_dimm_plug(PC_DIMM(dev), MACHINE(lvms)); + hotplug_handler_plug(HOTPLUG_HANDLER(lvms->acpi_ged), dev, &error_abort); } -static void loongarch_machine_device_plug_cb(HotplugHandler *hotplug_dev, +static void virt_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - LoongArchMachineState *lams = LOONGARCH_MACHINE(hotplug_dev); - MachineClass *mc = MACHINE_GET_CLASS(lams); + LoongArchVirtMachineState *lvms = LOONGARCH_VIRT_MACHINE(hotplug_dev); + MachineClass *mc = MACHINE_GET_CLASS(lvms); + PlatformBusDevice *pbus; if (device_is_dynamic_sysbus(mc, dev)) { - if (lams->platform_bus_dev) { - platform_bus_link_device(PLATFORM_BUS_DEVICE(lams->platform_bus_dev), - SYS_BUS_DEVICE(dev)); + if (lvms->platform_bus_dev) { + pbus = PLATFORM_BUS_DEVICE(lvms->platform_bus_dev); + platform_bus_link_device(pbus, SYS_BUS_DEVICE(dev)); } } else if (memhp_type_supported(dev)) { virt_mem_plug(hotplug_dev, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU)) { + virt_cpu_plug(hotplug_dev, dev, errp); } } -static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine, - DeviceState *dev) +static HotplugHandler *virt_get_hotplug_handler(MachineState *machine, + DeviceState *dev) { MachineClass *mc = MACHINE_GET_CLASS(machine); if (device_is_dynamic_sysbus(mc, dev) || + object_dynamic_cast(OBJECT(dev), TYPE_LOONGARCH_CPU) || + object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI) || memhp_type_supported(dev)) { return HOTPLUG_HANDLER(machine); } return NULL; } +static void virt_get_cpu_topo_from_index(MachineState *ms, + LoongArchCPUTopo *topo, int index) +{ + topo->socket_id = index / (ms->smp.cores * ms->smp.threads); + topo->core_id = index / ms->smp.threads % ms->smp.cores; + topo->thread_id = index % ms->smp.threads; +} + static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) { int n; unsigned int max_cpus = ms->smp.max_cpus; + LoongArchCPUTopo topo; if (ms->possible_cpus) { assert(ms->possible_cpus->len == max_cpus); @@ -1032,23 +1655,24 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) sizeof(CPUArchId) * max_cpus); ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { + ms->possible_cpus->cpus[n].vcpus_count = ms->smp.threads; ms->possible_cpus->cpus[n].type = ms->cpu_type; - ms->possible_cpus->cpus[n].arch_id = n; + virt_get_cpu_topo_from_index(ms, &topo, n); ms->possible_cpus->cpus[n].props.has_socket_id = true; - ms->possible_cpus->cpus[n].props.socket_id = - n / (ms->smp.cores * ms->smp.threads); + 
ms->possible_cpus->cpus[n].props.socket_id = topo.socket_id; ms->possible_cpus->cpus[n].props.has_core_id = true; - ms->possible_cpus->cpus[n].props.core_id = - n / ms->smp.threads % ms->smp.cores; + ms->possible_cpus->cpus[n].props.core_id = topo.core_id; ms->possible_cpus->cpus[n].props.has_thread_id = true; - ms->possible_cpus->cpus[n].props.thread_id = n % ms->smp.threads; + ms->possible_cpus->cpus[n].props.thread_id = topo.thread_id; + ms->possible_cpus->cpus[n].arch_id = + virt_get_arch_id_from_topo(ms, &topo); } return ms->possible_cpus; } -static CpuInstanceProperties -virt_cpu_index_to_props(MachineState *ms, unsigned cpu_index) +static CpuInstanceProperties virt_cpu_index_to_props(MachineState *ms, + unsigned cpu_index) { MachineClass *mc = MACHINE_GET_CLASS(ms); const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); @@ -1059,27 +1683,25 @@ virt_cpu_index_to_props(MachineState *ms, unsigned cpu_index) static int64_t virt_get_default_cpu_node_id(const MachineState *ms, int idx) { - int64_t nidx = 0; + int64_t socket_id; if (ms->numa_state->num_nodes) { - nidx = idx / (ms->smp.cpus / ms->numa_state->num_nodes); - if (ms->numa_state->num_nodes <= nidx) { - nidx = ms->numa_state->num_nodes - 1; - } + socket_id = ms->possible_cpus->cpus[idx].props.socket_id; + return socket_id % ms->numa_state->num_nodes; + } else { + return 0; } - return nidx; } -static void loongarch_class_init(ObjectClass *oc, void *data) +static void virt_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); - mc->desc = "Loongson-3A5000 LS7A1000 machine"; - mc->init = loongarch_init; - mc->default_ram_size = 1 * GiB; + mc->init = virt_init; mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464"); mc->default_ram_id = "loongarch.ram"; + mc->desc = "QEMU LoongArch Virtual Machine"; mc->max_cpus = LOONGARCH_MAX_CPUS; mc->is_default = 1; mc->default_kernel_irqchip_split = false; @@ -1092,31 +1714,36 @@ static void loongarch_class_init(ObjectClass *oc, void *data) mc->numa_mem_supported = true; mc->auto_enable_numa_with_memhp = true; mc->auto_enable_numa_with_memdev = true; - mc->get_hotplug_handler = virt_machine_get_hotplug_handler; + mc->has_hotpluggable_cpus = true; + mc->get_hotplug_handler = virt_get_hotplug_handler; mc->default_nic = "virtio-net-pci"; - hc->plug = loongarch_machine_device_plug_cb; - hc->pre_plug = virt_machine_device_pre_plug; - hc->unplug_request = virt_machine_device_unplug_request; - hc->unplug = virt_machine_device_unplug; + hc->plug = virt_device_plug_cb; + hc->pre_plug = virt_device_pre_plug; + hc->unplug_request = virt_device_unplug_request; + hc->unplug = virt_device_unplug; object_class_property_add(oc, "acpi", "OnOffAuto", - loongarch_get_acpi, loongarch_set_acpi, + virt_get_acpi, virt_set_acpi, NULL, NULL); object_class_property_set_description(oc, "acpi", "Enable ACPI"); + object_class_property_add(oc, "v-eiointc", "OnOffAuto", + virt_get_veiointc, virt_set_veiointc, NULL, NULL); + object_class_property_set_description(oc, "v-eiointc", + "Enable Virt Extend I/O Interrupt Controller"); machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE); #ifdef CONFIG_TPM machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS); #endif } -static const TypeInfo loongarch_machine_types[] = { +static const TypeInfo virt_machine_types[] = { { - .name = TYPE_LOONGARCH_MACHINE, + .name = TYPE_LOONGARCH_VIRT_MACHINE, .parent = TYPE_MACHINE, - .instance_size = sizeof(LoongArchMachineState), - 
.class_init = loongarch_class_init, - .instance_init = loongarch_machine_initfn, + .instance_size = sizeof(LoongArchVirtMachineState), + .class_init = virt_class_init, + .instance_init = virt_initfn, .interfaces = (InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, { } @@ -1124,4 +1751,4 @@ static const TypeInfo loongarch_machine_types[] = { } }; -DEFINE_TYPES(loongarch_machine_types) +DEFINE_TYPES(virt_machine_types) diff --git a/hw/m68k/next-kbd.c b/hw/m68k/next-kbd.c index 0c348c18cf29bfc07cf7f29059fe34caccf52abc..880ebe36021c286c8b3d67114c83edb96b4746b5 100644 --- a/hw/m68k/next-kbd.c +++ b/hw/m68k/next-kbd.c @@ -68,7 +68,6 @@ struct NextKBDState { uint16_t shift; }; -static void queue_code(void *opaque, int code); /* lots of magic numbers here */ static uint32_t kbd_read_byte(void *opaque, hwaddr addr) @@ -166,68 +165,70 @@ static const MemoryRegionOps kbd_ops = { .endianness = DEVICE_NATIVE_ENDIAN, }; -static void nextkbd_event(void *opaque, int ch) -{ - /* - * Will want to set vars for caps/num lock - * if (ch & 0x80) -> key release - * there's also e0 escaped scancodes that might need to be handled - */ - queue_code(opaque, ch); -} - -static const unsigned char next_keycodes[128] = { - 0x00, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x50, 0x4F, - 0x4E, 0x1E, 0x1F, 0x20, 0x1D, 0x1C, 0x1B, 0x00, - 0x42, 0x43, 0x44, 0x45, 0x48, 0x47, 0x46, 0x06, - 0x07, 0x08, 0x00, 0x00, 0x2A, 0x00, 0x39, 0x3A, - 0x3B, 0x3C, 0x3D, 0x40, 0x3F, 0x3E, 0x2D, 0x2C, - 0x2B, 0x26, 0x00, 0x00, 0x31, 0x32, 0x33, 0x34, - 0x35, 0x37, 0x36, 0x2e, 0x2f, 0x30, 0x00, 0x00, - 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +static const int qcode_to_nextkbd_keycode[] = { + [Q_KEY_CODE_ESC] = 0x49, + [Q_KEY_CODE_1] = 0x4a, + [Q_KEY_CODE_2] = 0x4b, + [Q_KEY_CODE_3] = 0x4c, + [Q_KEY_CODE_4] = 0x4d, + [Q_KEY_CODE_5] = 0x50, + [Q_KEY_CODE_6] = 0x4f, + [Q_KEY_CODE_7] = 0x4e, + [Q_KEY_CODE_8] = 0x1e, + [Q_KEY_CODE_9] = 0x1f, + [Q_KEY_CODE_0] = 0x20, + [Q_KEY_CODE_MINUS] = 0x1d, + [Q_KEY_CODE_EQUAL] = 0x1c, + [Q_KEY_CODE_BACKSPACE] = 0x1b, + + [Q_KEY_CODE_Q] = 0x42, + [Q_KEY_CODE_W] = 0x43, + [Q_KEY_CODE_E] = 0x44, + [Q_KEY_CODE_R] = 0x45, + [Q_KEY_CODE_T] = 0x48, + [Q_KEY_CODE_Y] = 0x47, + [Q_KEY_CODE_U] = 0x46, + [Q_KEY_CODE_I] = 0x06, + [Q_KEY_CODE_O] = 0x07, + [Q_KEY_CODE_P] = 0x08, + [Q_KEY_CODE_RET] = 0x2a, + [Q_KEY_CODE_A] = 0x39, + [Q_KEY_CODE_S] = 0x3a, + + [Q_KEY_CODE_D] = 0x3b, + [Q_KEY_CODE_F] = 0x3c, + [Q_KEY_CODE_G] = 0x3d, + [Q_KEY_CODE_H] = 0x40, + [Q_KEY_CODE_J] = 0x3f, + [Q_KEY_CODE_K] = 0x3e, + [Q_KEY_CODE_L] = 0x2d, + [Q_KEY_CODE_SEMICOLON] = 0x2c, + [Q_KEY_CODE_APOSTROPHE] = 0x2b, + [Q_KEY_CODE_GRAVE_ACCENT] = 0x26, + [Q_KEY_CODE_Z] = 0x31, + [Q_KEY_CODE_X] = 0x32, + [Q_KEY_CODE_C] = 0x33, + [Q_KEY_CODE_V] = 0x34, + + [Q_KEY_CODE_B] = 0x35, + [Q_KEY_CODE_N] = 0x37, + [Q_KEY_CODE_M] = 0x36, + [Q_KEY_CODE_COMMA] = 0x2e, + [Q_KEY_CODE_DOT] = 0x2f, + [Q_KEY_CODE_SLASH] = 0x30, + + [Q_KEY_CODE_SPC] = 0x38, }; -static void queue_code(void *opaque, int code) +static void nextkbd_put_keycode(NextKBDState *s, int keycode) { - NextKBDState *s = NEXTKBD(opaque); KBDQueue *q = &s->queue; - int key = code & KD_KEYMASK; - int release = code & 0x80; - static int ext; - - if (code == 0xE0) { - ext = 1; - } - - if (code == 0x2A || code == 0x1D || code == 0x36) { - if (code == 0x2A) { - s->shift = KD_LSHIFT; - } else if (code == 0x36) { - 
s->shift = KD_RSHIFT; - ext = 0; - } else if (code == 0x1D && !ext) { - s->shift = KD_LCOMM; - } else if (code == 0x1D && ext) { - ext = 0; - s->shift = KD_RCOMM; - } - return; - } else if (code == (0x2A | 0x80) || code == (0x1D | 0x80) || - code == (0x36 | 0x80)) { - s->shift = 0; - return; - } if (q->count >= KBD_QUEUE_SIZE) { return; } - q->data[q->wptr] = next_keycodes[key] | release; - + q->data[q->wptr] = keycode; if (++q->wptr == KBD_QUEUE_SIZE) { q->wptr = 0; } @@ -241,6 +242,53 @@ static void queue_code(void *opaque, int code) /* s->update_irq(s->update_arg, 1); */ } +static void nextkbd_event(DeviceState *dev, QemuConsole *src, InputEvent *evt) +{ + NextKBDState *s = NEXTKBD(dev); + int qcode, keycode; + bool key_down = evt->u.key.data->down; + + qcode = qemu_input_key_value_to_qcode(evt->u.key.data->key); + if (qcode >= ARRAY_SIZE(qcode_to_nextkbd_keycode)) { + return; + } + + /* Shift key currently has no keycode, so handle separately */ + if (qcode == Q_KEY_CODE_SHIFT) { + if (key_down) { + s->shift |= KD_LSHIFT; + } else { + s->shift &= ~KD_LSHIFT; + } + } + + if (qcode == Q_KEY_CODE_SHIFT_R) { + if (key_down) { + s->shift |= KD_RSHIFT; + } else { + s->shift &= ~KD_RSHIFT; + } + } + + keycode = qcode_to_nextkbd_keycode[qcode]; + if (!keycode) { + return; + } + + /* If key release event, create keyboard break code */ + if (!key_down) { + keycode |= 0x80; + } + + nextkbd_put_keycode(s, keycode); +} + +static const QemuInputHandler nextkbd_handler = { + .name = "QEMU NeXT Keyboard", + .mask = INPUT_EVENT_MASK_KEY, + .event = nextkbd_event, +}; + static void nextkbd_reset(DeviceState *dev) { NextKBDState *nks = NEXTKBD(dev); @@ -256,7 +304,7 @@ static void nextkbd_realize(DeviceState *dev, Error **errp) memory_region_init_io(&s->mr, OBJECT(dev), &kbd_ops, s, "next.kbd", 0x1000); sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->mr); - qemu_add_kbd_event_handler(nextkbd_event, s); + qemu_input_handler_register(dev, &nextkbd_handler); } static const VMStateDescription nextkbd_vmstate = { diff --git a/hw/misc/Kconfig b/hw/misc/Kconfig index cc8a8c1418ff007bd6eaaa73f098415beef51735..2ea5c68eb5da574e4d77170bc1d54b88c65fdf86 100644 --- a/hw/misc/Kconfig +++ b/hw/misc/Kconfig @@ -200,4 +200,8 @@ config IOSB config XLNX_VERSAL_TRNG bool +config PSP_DEV + bool + default y + source macio/Kconfig diff --git a/hw/misc/aspeed_hace.c b/hw/misc/aspeed_hace.c index b07506ec04efe3dfe3faef16a5fc23e58dd40802..8706e3d37644f645a3e8bca83309e06f4a6278eb 100644 --- a/hw/misc/aspeed_hace.c +++ b/hw/misc/aspeed_hace.c @@ -123,6 +123,11 @@ static bool has_padding(AspeedHACEState *s, struct iovec *iov, if (*total_msg_len <= s->total_req_len) { uint32_t padding_size = s->total_req_len - *total_msg_len; uint8_t *padding = iov->iov_base; + + if (padding_size > req_len) { + return false; + } + *pad_offset = req_len - padding_size; if (padding[*pad_offset] == 0x80) { return true; diff --git a/hw/misc/bcm2835_property.c b/hw/misc/bcm2835_property.c index ff55a4e2cd2f94aeedf65cba047cc7caac3bb5ae..12a1bc558ab6fa52f6816f570fd9d199c4ffa28e 100644 --- a/hw/misc/bcm2835_property.c +++ b/hw/misc/bcm2835_property.c @@ -28,8 +28,6 @@ static void bcm2835_property_mbox_push(BCM2835PropertyState *s, uint32_t value) uint32_t tot_len; size_t resplen; uint32_t tmp; - int n; - uint32_t offset, length, color; /* * Copy the current state of the framebuffer config; we will update @@ -264,18 +262,25 @@ static void bcm2835_property_mbox_push(BCM2835PropertyState *s, uint32_t value) resplen = 16; break; case RPI_FWREQ_FRAMEBUFFER_SET_PALETTE: 
- offset = ldl_le_phys(&s->dma_as, value + 12); - length = ldl_le_phys(&s->dma_as, value + 16); - n = 0; - while (n < length - offset) { - color = ldl_le_phys(&s->dma_as, value + 20 + (n << 2)); - stl_le_phys(&s->dma_as, - s->fbdev->vcram_base + ((offset + n) << 2), color); - n++; + { + uint32_t offset = ldl_le_phys(&s->dma_as, value + 12); + uint32_t length = ldl_le_phys(&s->dma_as, value + 16); + int resp; + + if (offset > 255 || length < 1 || length > 256) { + resp = 1; /* invalid request */ + } else { + for (uint32_t e = 0; e < length; e++) { + uint32_t color = ldl_le_phys(&s->dma_as, value + 20 + (e << 2)); + stl_le_phys(&s->dma_as, + s->fbdev->vcram_base + ((offset + e) << 2), color); + } + resp = 0; } - stl_le_phys(&s->dma_as, value + 12, 0); + stl_le_phys(&s->dma_as, value + 12, resp); resplen = 4; break; + } case RPI_FWREQ_FRAMEBUFFER_GET_NUM_DISPLAYS: stl_le_phys(&s->dma_as, value + 12, 1); resplen = 4; diff --git a/hw/misc/edu.c b/hw/misc/edu.c index a1f8bc77e7701eb90abab3faef46164503e39560..e64a246d3febf9c096c27cb299b83d6fa1a0477d 100644 --- a/hw/misc/edu.c +++ b/hw/misc/edu.c @@ -115,7 +115,7 @@ static void edu_check_range(uint64_t addr, uint64_t size1, uint64_t start, uint64_t end2 = start + size2; if (within(addr, start, end2) && - end1 > addr && within(end1, start, end2)) { + end1 > addr && end1 <= end2) { return; } diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 044788802982426261b7e2e5750ad12d12d6d35b..f66491a7a7a68fac494fc7707e0e5ab22d160f17 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -278,6 +278,7 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector, IVShmemState *s = IVSHMEM_COMMON(dev); EventNotifier *n = &s->peers[s->vm_id].eventfds[vector]; MSIVector *v = &s->msi_vectors[vector]; + KVMRouteChange c; int ret; IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector); @@ -287,11 +288,12 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector, } assert(!v->unmasked); - ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev); + c = kvm_irqchip_begin_route_changes(kvm_state); + ret = kvm_irqchip_update_msi_route(&c, v->virq, msg, dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_routes(kvm_state); + kvm_irqchip_commit_route_changes(&c); ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq); if (ret < 0) { @@ -400,6 +402,7 @@ static void close_peer_eventfds(IVShmemState *s, int posn) } g_free(s->peers[posn].eventfds); + s->peers[posn].eventfds = NULL; s->peers[posn].nb_eventfds = 0; } @@ -533,6 +536,10 @@ static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd, close(fd); return; } + if (peer->eventfds == NULL) { + peer->eventfds = g_new0(EventNotifier, s->vectors); + peer->nb_eventfds = 0; + } vector = peer->nb_eventfds++; IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd); diff --git a/hw/misc/meson.build b/hw/misc/meson.build index 36c20d5637f70cf0db17173a4cb388a2b65bb8f9..28cba0ac2859ce90fccd4f3f4b34adb8cacf8bdd 100644 --- a/hw/misc/meson.build +++ b/hw/misc/meson.build @@ -9,6 +9,7 @@ system_ss.add(when: 'CONFIG_UNIMP', if_true: files('unimp.c')) system_ss.add(when: 'CONFIG_EMPTY_SLOT', if_true: files('empty_slot.c')) system_ss.add(when: 'CONFIG_LED', if_true: files('led.c')) system_ss.add(when: 'CONFIG_PVPANIC_COMMON', if_true: files('pvpanic.c')) +system_ss.add(when: 'CONFIG_PSP_DEV', if_true: files('psp.c')) # ARM devices system_ss.add(when: 'CONFIG_PL310', if_true: files('arm_l2x0.c')) diff --git a/hw/misc/nrf51_rng.c b/hw/misc/nrf51_rng.c index 
fc86e1b697905d2ae2fb6c5c69a54181a53b06f0..e911b3a3a30114309f1869e4f8e33774c1f072c1 100644 --- a/hw/misc/nrf51_rng.c +++ b/hw/misc/nrf51_rng.c @@ -107,25 +107,25 @@ static void rng_write(void *opaque, hwaddr offset, break; case NRF51_RNG_REG_SHORTS: s->shortcut_stop_on_valrdy = - (value & BIT_MASK(NRF51_RNG_REG_SHORTS_VALRDY_STOP)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_SHORTS_VALRDY_STOP)) ? 1 : 0; break; case NRF51_RNG_REG_INTEN: s->interrupt_enabled = - (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) ? 1 : 0; break; case NRF51_RNG_REG_INTENSET: - if (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) { + if (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) { s->interrupt_enabled = 1; } break; case NRF51_RNG_REG_INTENCLR: - if (value & BIT_MASK(NRF51_RNG_REG_INTEN_VALRDY)) { + if (value & BIT(NRF51_RNG_REG_INTEN_VALRDY)) { s->interrupt_enabled = 0; } break; case NRF51_RNG_REG_CONFIG: s->filter_enabled = - (value & BIT_MASK(NRF51_RNG_REG_CONFIG_DECEN)) ? 1 : 0; + (value & BIT(NRF51_RNG_REG_CONFIG_DECEN)) ? 1 : 0; break; default: diff --git a/hw/misc/psp.c b/hw/misc/psp.c new file mode 100644 index 0000000000000000000000000000000000000000..03e8663027c689a36a1d9055ffd718efd2ec6e78 --- /dev/null +++ b/hw/misc/psp.c @@ -0,0 +1,307 @@ +/* + * Hygon PSP device emulation + * + * Copyright 2024 HYGON Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include "qemu/osdep.h" +#include "qemu/compiler.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "migration/vmstate.h" +#include "hw/qdev-properties.h" +#include "sysemu/runstate.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" +#include "exec/ramblock.h" +#include "hw/i386/e820_memory_layout.h" +#include <sys/ioctl.h> + +#define TYPE_PSP_DEV "psp" +OBJECT_DECLARE_SIMPLE_TYPE(PSPDevState, PSP_DEV) + +struct PSPDevState { + /* Private */ + DeviceState pdev; + + /* Public */ + Notifier shutdown_notifier; + int dev_fd; + bool enabled; + + /** + * The vid identifies a virtual machine in QEMU. + * When a virtual machine accesses a TKM key, + * the TKM module selects the key space based on the vid.
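+ *
+ * Hypothetical usage sketch (the "vid" property is defined near the
+ * bottom of this file; all other command-line details elided):
+ *
+ *   qemu-system-x86_64 ... -device psp,vid=1
+ *
+ * Giving each VM a distinct vid keeps their TKM keys in separate
+ * key spaces.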
+ */ + uint32_t vid; + /* number of pinned hugepages */ + int hp_num; +}; + +#define PSP_DEV_PATH "/dev/hygon_psp_config" +#define HYGON_PSP_IOC_TYPE 'H' +#define PSP_IOC_MUTEX_ENABLE _IOWR(HYGON_PSP_IOC_TYPE, 1, NULL) +#define PSP_IOC_MUTEX_DISABLE _IOWR(HYGON_PSP_IOC_TYPE, 2, NULL) +#define PSP_IOC_VPSP_OPT _IOWR(HYGON_PSP_IOC_TYPE, 3, NULL) +#define PSP_IOC_PIN_USER_PAGE _IOWR(HYGON_PSP_IOC_TYPE, 4, NULL) +#define PSP_IOC_UNPIN_USER_PAGE _IOWR(HYGON_PSP_IOC_TYPE, 5, NULL) + +enum VPSP_DEV_CTRL_OPCODE { + VPSP_OP_VID_ADD, + VPSP_OP_VID_DEL, + VPSP_OP_SET_DEFAULT_VID_PERMISSION, + VPSP_OP_GET_DEFAULT_VID_PERMISSION, + VPSP_OP_SET_GPA, +}; + +struct psp_dev_ctrl { + unsigned char op; + unsigned char resv[3]; + union { + unsigned int vid; + /* set or check the permissions for the default VID */ + unsigned int def_vid_perm; + struct { + uint64_t gpa_start; + uint64_t gpa_end; + } gpa; + unsigned char reserved[128]; + } __attribute__ ((packed)) data; +}; + +static MemoryRegion *find_memory_region_by_name(MemoryRegion *root, const char *name) { + MemoryRegion *subregion; + MemoryRegion *result; + + if (strcmp(root->name, name) == 0) { + return root; + } + + QTAILQ_FOREACH(subregion, &root->subregions, subregions_link) { + result = find_memory_region_by_name(subregion, name); + if (result) { + return result; + } + } + + return NULL; +} + +static int pin_user_hugepage(int fd, uint64_t vaddr) +{ + int ret; + + ret = ioctl(fd, PSP_IOC_PIN_USER_PAGE, vaddr); + /* EINVAL (22): some older kernels don't support this ioctl command */ + if (ret != 0 && errno == EINVAL) { + ret = 0; + } + return ret; +} + +static int unpin_user_hugepage(int fd, uint64_t vaddr) +{ + int ret; + + ret = ioctl(fd, PSP_IOC_UNPIN_USER_PAGE, vaddr); + /* EINVAL (22): some older kernels don't support this ioctl command */ + if (ret != 0 && errno == EINVAL) { + ret = 0; + } + return ret; +} + +static int pin_psp_user_hugepages(struct PSPDevState *state, MemoryRegion *root) +{ + int ret = 0; + char mr_name[128] = {0}; + int i, pinned_num; + MemoryRegion *find_mr = NULL; + + for (i = 0; i < state->hp_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + error_report("failed to find memory region by name %s.", mr_name); + ret = -ENOMEM; + goto end; + } + + ret = pin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + if (ret) { + error_report("failed to pin user hugepage, ret: %d.", ret); + goto end; + } + } +end: + if (ret) { + pinned_num = i; + for (i = 0; i < pinned_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + continue; + } + unpin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + } + + } + return ret; +} + +static int unpin_psp_user_hugepages(struct PSPDevState *state, MemoryRegion *root) +{ + int ret = 0; + char mr_name[128] = {0}; + int i; + MemoryRegion *find_mr = NULL; + + for (i = 0; i < state->hp_num; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root, mr_name); + if (!find_mr) { + continue; + } + + ret = unpin_user_hugepage(state->dev_fd, (uint64_t)find_mr->ram_block->host); + if (ret) { + error_report("failed to unpin user hugepage, ret: %d.", ret); + goto end; + } + } +end: + return ret; +} + +static void psp_dev_destroy(PSPDevState *state) +{ + struct psp_dev_ctrl ctrl = { 0 }; + if (state && state->dev_fd) { + if (state->enabled) { + ctrl.op = VPSP_OP_VID_DEL; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT,
&ctrl) < 0) { + error_report("VPSP_OP_VID_DEL: %d", -errno); + } + + /* Unpin hugepage memory */ + if (unpin_psp_user_hugepages(state, get_system_memory())) { + error_report("unpin_psp_user_hugepages failed"); + } else { + state->enabled = false; + } + } + qemu_close(state->dev_fd); + state->dev_fd = 0; + } +} + +/* + * The guest OS shuts down through the 'shutdown' and 'powerdown' events. + * Since 'powerdown' also triggers 'shutdown' in the end, only the + * 'shutdown' event needs to be handled here. + * + * Nothing is done when the guest triggers a 'reboot' or 'reset' event. + */ +static void psp_dev_shutdown_notify(Notifier *notifier, void *data) +{ + PSPDevState *state = container_of(notifier, PSPDevState, shutdown_notifier); + psp_dev_destroy(state); +} + +static void psp_dev_realize(DeviceState *dev, Error **errp) +{ + int i; + char mr_name[128] = {0}; + struct psp_dev_ctrl ctrl = { 0 }; + PSPDevState *state = PSP_DEV(dev); + MemoryRegion *root_mr = get_system_memory(); + MemoryRegion *find_mr = NULL; + uint64_t ram2_start = 0, ram2_end = 0; + + state->dev_fd = qemu_open_old(PSP_DEV_PATH, O_RDWR); + if (state->dev_fd < 0) { + error_setg(errp, "failed to open %s, errno %d.", PSP_DEV_PATH, errno); + goto end; + } + + ctrl.op = VPSP_OP_VID_ADD; + ctrl.data.vid = state->vid; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl) < 0) { + error_setg(errp, "psp_dev_realize VPSP_OP_VID_ADD vid %d, return %d", ctrl.data.vid, -errno); + goto end; + } + + for (i = 0; ; ++i) { + sprintf(mr_name, "mem2-%d", i); + find_mr = find_memory_region_by_name(root_mr, mr_name); + if (!find_mr) { + break; + } + + if (!ram2_start) { + ram2_start = find_mr->addr; + } + ram2_end = find_mr->addr + find_mr->size - 1; + } + + state->hp_num = i; + + if (ram2_start != ram2_end) { + ctrl.op = VPSP_OP_SET_GPA; + ctrl.data.gpa.gpa_start = ram2_start; + ctrl.data.gpa.gpa_end = ram2_end; + if (ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl) < 0) { + error_setg(errp, "psp_dev_realize VPSP_OP_SET_GPA (start 0x%" PRIx64 ", end 0x%" PRIx64 "), return %d", + ram2_start, ram2_end, -errno); + goto del_vid; + } + + /* Pin hugepage memory */ + if (pin_psp_user_hugepages(state, root_mr)) { + error_setg(errp, "pin_psp_user_hugepages failed."); + goto del_vid; + } + } + + state->enabled = true; + state->shutdown_notifier.notify = psp_dev_shutdown_notify; + qemu_register_shutdown_notifier(&state->shutdown_notifier); + + return; +del_vid: + ctrl.op = VPSP_OP_VID_DEL; + ioctl(state->dev_fd, PSP_IOC_VPSP_OPT, &ctrl); +end: + return; +} + +static Property psp_dev_properties[] = { + DEFINE_PROP_UINT32("vid", PSPDevState, vid, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void psp_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->desc = "PSP Device"; + dc->realize = psp_dev_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, psp_dev_properties); +} + +static const TypeInfo psp_dev_info = { + .name = TYPE_PSP_DEV, + .parent = TYPE_DEVICE, + .instance_size = sizeof(PSPDevState), + .class_init = psp_dev_class_init, +}; + +static void psp_dev_register_types(void) +{ + type_register_static(&psp_dev_info); +} + +type_init(psp_dev_register_types) diff --git a/hw/net/can/can_sja1000.c b/hw/net/can/can_sja1000.c index 73201f91395af81dea302332c7122ff4c3b3d84c..575df7d2f8dc3d5b7143509abdf9740b407fda63 100644 --- a/hw/net/can/can_sja1000.c +++ b/hw/net/can/can_sja1000.c @@ -108,7 +108,7 @@ void can_sja_single_filter(struct qemu_can_filter *filter, } filter->can_mask = (uint32_t)amr[0] <<
3; - filter->can_mask |= (uint32_t)amr[1] << 5; + filter->can_mask |= (uint32_t)amr[1] >> 5; filter->can_mask = ~filter->can_mask & QEMU_CAN_SFF_MASK; if (!(amr[1] & 0x10)) { filter->can_mask |= QEMU_CAN_RTR_FLAG; diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c index 69e1c4bb8918fa506cf9f420edb257a0f5424127..f6204ec0590005eb8c6bfe221805fd61be4ae6ec 100644 --- a/hw/net/eepro100.c +++ b/hw/net/eepro100.c @@ -279,6 +279,9 @@ typedef struct { /* Quasi static device properties (no need to save them). */ uint16_t stats_size; bool has_extended_tcb_support; + + /* Flag to avoid recursions. */ + bool busy; } EEPRO100State; /* Word indices in EEPROM. */ @@ -844,6 +847,14 @@ static void action_command(EEPRO100State *s) Therefore we limit the number of iterations. */ unsigned max_loop_count = 16; + if (s->busy) { + /* Prevent recursions. */ + logout("recursion in %s:%u\n", __FILE__, __LINE__); + return; + } + + s->busy = true; + for (;;) { bool bit_el; bool bit_s; @@ -940,6 +951,7 @@ static void action_command(EEPRO100State *s) } TRACE(OTHER, logout("CU list empty\n")); /* List is empty. Now CU is idle or suspended. */ + s->busy = false; } static void eepro100_cu_command(EEPRO100State * s, uint8_t val) diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index 2e5f58b3c9cc643ddf9aed2cedc2e247b6429570..d40d508a11bf8af3e769410b0cad0abb0c6b2b8f 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -141,6 +141,10 @@ bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt) uint32_t csum = 0; struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG; + if (iov_size(pl_start_frag, pkt->payload_frags) < 8 + sizeof(csum)) { + return false; + } + if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { return false; } diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c index a7e123e60dbab97ebfafabb05fbbb99576af9cc9..7d574f487b8b6c00a561d7207a4db77e2dab71d3 100644 --- a/hw/net/pcnet.c +++ b/hw/net/pcnet.c @@ -632,7 +632,7 @@ static inline int ladr_match(PCNetState *s, const uint8_t *buf, int size) { struct qemu_ether_header *hdr = (void *)buf; if ((*(hdr->ether_dhost)&0x01) && - ((uint64_t *)&s->csr[8])[0] != 0LL) { + (s->csr[8] | s->csr[9] | s->csr[10] | s->csr[11]) != 0) { uint8_t ladr[8] = { s->csr[8] & 0xff, s->csr[8] >> 8, s->csr[9] & 0xff, s->csr[9] >> 8, diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c index 5e16056be661e54ff0da2ff8bdcf116055fe1e45..c25438ccccc3e18d8b18c9fb3148f7b1b8aa276b 100644 --- a/hw/net/rocker/rocker_of_dpa.c +++ b/hw/net/rocker/rocker_of_dpa.c @@ -2070,6 +2070,7 @@ static int of_dpa_cmd_add_l2_flood(OfDpa *of_dpa, OfDpaGroup *group, err_out: group->l2_flood.group_count = 0; g_free(group->l2_flood.group_ids); + group->l2_flood.group_ids = NULL; g_free(tlvs); return err; diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index e8e1661646219094995538af85e792f2e593269a..a02d65d208f8b8646c2beee91beef8b18109c77e 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -167,9 +167,26 @@ static int vhost_net_get_fd(NetClientState *backend) } } +static uint64_t vhost_get_mask_features(const int *feature_bits, uint64_t features) +{ + const int *bit = feature_bits; + uint64_t out_features = 0; + + while (*bit != VHOST_INVALID_FEATURE_BIT) { + uint64_t bit_mask = (1ULL << *bit); + if (features & bit_mask) { + out_features |= bit_mask; + } + bit++; + } + return out_features; +} + struct vhost_net *vhost_net_init(VhostNetOptions *options) { int r; + VirtIONet *n; + VirtIODevice *vdev; bool backend_kernel = 
options->backend_type == VHOST_BACKEND_TYPE_KERNEL; struct vhost_net *net = g_new0(struct vhost_net, 1); uint64_t features = 0; @@ -195,7 +212,46 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) net->backend = r; net->dev.protocol_features = 0; } else { - net->dev.backend_features = 0; + /* Handle OVS restarting while the VM is starting up. + * Normal situation: + * 1. The VM starts. + * 2. vhost_net_init() completes; dev.acked_features is 0x40000000. + * 3. The guest virtio-net module loads. QEMU calls + * virtio_net_set_features(), setting dev.acked_features to + * 0x40408000. + * 4. The features (0x40408000) are set on OVS's vhostuser port. + * 5. OVS restarts. + * 6. vhost_user_stop() saves net->dev.acked_features (0x40408000) + * into VhostUserState's acked_features (0x40408000). + * 7. The restart completes. + * 8. vhost_net_init() calls vhost_user_get_acked_features() to fetch + * the saved features and stores them in net->dev.acked_features. + * Abnormal situation: + * 1. The VM starts. + * 2. vhost_net_init() completes; dev.acked_features is 0x40000000. + * 3. OVS restarts. + * 4. vhost_user_stop() saves net->dev.acked_features (0x40000000) + * into VhostUserState's acked_features (0x40000000). + * 5. The guest virtio-net module loads. QEMU calls + * virtio_net_set_features(), setting dev.acked_features to + * 0x40408000. + * 6. The restart completes. + * 7. vhost_net_init() calls vhost_user_get_acked_features(), fetches + * the saved features (0x40000000), and stores them in + * net->dev.acked_features (0x40000000). + * 8. The features (0x40000000) are set on OVS's vhostuser port. + * + * In the abnormal situation QEMU hands the wrong features to OVS's + * vhostuser port, and the VM's network goes down. Since only the + * guest features were lost from acked_features there, set + * acked_features from the VM's guest features here, exactly as + * loading the guest virtio-net module would. + */ + if (options->net_backend->peer) { + n = qemu_get_nic_opaque(options->net_backend->peer); + vdev = VIRTIO_DEVICE(n); + net->dev.backend_features = vhost_get_mask_features(vhost_net_get_feature_bits(net), + vdev->guest_features); + } else { + net->dev.backend_features = 0; + } net->dev.protocol_features = 0; net->backend = -1; @@ -403,7 +459,9 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, peer = qemu_get_peer(ncs, n->max_queue_pairs); } - if (peer->vring_enable) { + /* OVS needs to restore all vring states */ + if (peer->vring_enable || + ncs[i].peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { /* restore vring enable state */ r = vhost_set_vring_enable(peer, peer->vring_enable); @@ -541,6 +599,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable) VHostNetState *net = get_vhost_net(nc); const VhostOps *vhost_ops = net->dev.vhost_ops; + /* + * vhost-vdpa network devices need to enable dataplane virtqueues after + * DRIVER_OK, so they can recover device state before starting dataplane. + * Because of that, we don't enable virtqueues here and leave it to + * net/vhost-vdpa.c.
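+ *
+ * Rough ordering sketch implied by the above (the details live in
+ * net/vhost-vdpa.c rather than here):
+ *   1. the virtqueues are set up and the guest sets DRIVER_OK
+ *   2. device state is restored (e.g. through the control virtqueue)
+ *   3. only then are the dataplane virtqueues enabled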
+ */ + if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { + return 0; + } + nc->vring_enable = enable; if (vhost_ops && vhost_ops->vhost_set_vring_enable) { diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 80c56f0cfcf1675a1d1815489d0cfb94b525736e..25044385dc00008c71331789aa94f4ce0194895f 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -50,12 +50,11 @@ #define VIRTIO_NET_VM_VERSION 11 /* previously fixed value */ -#define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256 -#define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256 +#define VIRTIO_NET_VHOST_USER_DEFAULT_SIZE 2048 /* for now, only allow larger queue_pairs; with virtio-1, guest can downsize */ -#define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE -#define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE +#define VIRTIO_NET_RX_QUEUE_MIN_SIZE 256 +#define VIRTIO_NET_TX_QUEUE_MIN_SIZE 256 #define VIRTIO_NET_IP4_ADDR_SIZE 8 /* ipv4 saddr + daddr */ @@ -674,6 +673,11 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, n->mergeable_rx_bufs = mergeable_rx_bufs; + /* + * Note: when extending the vnet header, please make sure to + * change the vnet header copying logic in virtio_net_flush_tx() + * as well. + */ if (version_1) { n->guest_hdr_len = hash_report ? sizeof(struct virtio_net_hdr_v1_hash) : @@ -696,6 +700,28 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs, } } +static void virtio_net_set_default_queue_size(VirtIONet *n) +{ + NetClientState *peer = n->nic_conf.peers.ncs[0]; + + /* Default value is 0 if not set */ + if (n->net_conf.rx_queue_size == 0) { + if (peer && peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { + n->net_conf.rx_queue_size = VIRTIO_NET_VHOST_USER_DEFAULT_SIZE; + } else { + n->net_conf.rx_queue_size = VIRTIO_NET_VQ_MAX_SIZE; + } + } + + if (n->net_conf.tx_queue_size == 0) { + if (peer && peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) { + n->net_conf.tx_queue_size = VIRTIO_NET_VHOST_USER_DEFAULT_SIZE; + } else { + n->net_conf.tx_queue_size = VIRTIO_NET_VQ_MAX_SIZE; + } + } +} + static int virtio_net_max_tx_queue_size(VirtIONet *n) { NetClientState *peer = n->nic_conf.peers.ncs[0]; @@ -705,15 +731,16 @@ static int virtio_net_max_tx_queue_size(VirtIONet *n) * size. 
*/ if (!peer) { - return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; } switch(peer->info->type) { case NET_CLIENT_DRIVER_VHOST_USER: + return VIRTIO_NET_VQ_MAX_SIZE; case NET_CLIENT_DRIVER_VHOST_VDPA: - return VIRTQUEUE_MAX_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; default: - return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE; + return VIRTIO_NET_VQ_MAX_SIZE; }; } @@ -1373,17 +1400,17 @@ static uint16_t virtio_net_handle_rss(VirtIONet *n, n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types); n->rss_data.indirections_len = virtio_lduw_p(vdev, &cfg.indirection_table_mask); - n->rss_data.indirections_len++; if (!do_rss) { - n->rss_data.indirections_len = 1; + n->rss_data.indirections_len = 0; } - if (!is_power_of_2(n->rss_data.indirections_len)) { - err_msg = "Invalid size of indirection table"; + if (n->rss_data.indirections_len >= VIRTIO_NET_RSS_MAX_TABLE_LEN) { + err_msg = "Too large indirection table"; err_value = n->rss_data.indirections_len; goto error; } - if (n->rss_data.indirections_len > VIRTIO_NET_RSS_MAX_TABLE_LEN) { - err_msg = "Too large indirection table"; + n->rss_data.indirections_len++; + if (!is_power_of_2(n->rss_data.indirections_len)) { + err_msg = "Invalid size of indirection table"; err_value = n->rss_data.indirections_len; goto error; } @@ -1635,24 +1662,28 @@ static bool virtio_net_can_receive(NetClientState *nc) static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize) { + int opaque; + unsigned int in_bytes; VirtIONet *n = q->n; - if (virtio_queue_empty(q->rx_vq) || - (n->mergeable_rx_bufs && - !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { - virtio_queue_set_notification(q->rx_vq, 1); - - /* To avoid a race condition where the guest has made some buffers - * available after the above check but before notification was - * enabled, check for available buffers again. 
- */ - if (virtio_queue_empty(q->rx_vq) || - (n->mergeable_rx_bufs && - !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) { + + while (virtio_queue_empty(q->rx_vq) || n->mergeable_rx_bufs) { + opaque = virtqueue_get_avail_bytes(q->rx_vq, &in_bytes, NULL, + bufsize, 0); + /* Buffers are sufficient, disable notification */ + if (bufsize <= in_bytes) { + break; + } + + if (virtio_queue_enable_notification_and_check(q->rx_vq, opaque)) { + /* Guest has added some buffers, try again */ + continue; + } else { return 0; } } virtio_queue_set_notification(q->rx_vq, 0); + return 1; } @@ -1904,7 +1935,8 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) { int index = virtio_net_process_rss(nc, buf, size); if (index >= 0) { - NetClientState *nc2 = qemu_get_subqueue(n->nic, index); + NetClientState *nc2 = + qemu_get_subqueue(n->nic, index % n->curr_queue_pairs); return virtio_net_receive_rcu(nc2, buf, size, true); } } @@ -1964,7 +1996,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, sg, elem->in_num, offsetof(typeof(mhdr), num_buffers), sizeof(mhdr.num_buffers)); - } + } else { + mhdr.num_buffers = cpu_to_le16(1); + } receive_header(n, sg, elem->in_num, buf, size); if (n->rss_data.populate_hash) { @@ -2114,7 +2148,7 @@ static void virtio_net_rsc_purge(void *opq) chain->stat.timer++; if (!QTAILQ_EMPTY(&chain->buffers)) { timer_mod(chain->drain_timer, - qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout); + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout); } } @@ -2350,7 +2384,7 @@ static size_t virtio_net_rsc_do_coalesce(VirtioNetRscChain *chain, chain->stat.empty_cache++; virtio_net_rsc_cache_buf(chain, nc, buf, size); timer_mod(chain->drain_timer, - qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout); + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + chain->n->rsc_timeout); return size; } @@ -2588,7 +2622,7 @@ static VirtioNetRscChain *virtio_net_rsc_lookup_chain(VirtIONet *n, chain->max_payload = VIRTIO_NET_MAX_IP6_PAYLOAD; chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; } - chain->drain_timer = timer_new_ns(QEMU_CLOCK_HOST, + chain->drain_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, virtio_net_rsc_purge, chain); memset(&chain->stat, 0, sizeof(chain->stat)); @@ -2693,7 +2727,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) ssize_t ret; unsigned int out_num; struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg; - struct virtio_net_hdr_mrg_rxbuf mhdr; + struct virtio_net_hdr_v1_hash vhdr; elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement)); if (!elem) { @@ -2704,22 +2738,18 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) out_sg = elem->out_sg; if (out_num < 1) { virtio_error(vdev, "virtio-net header not in first element"); - virtqueue_detach_element(q->tx_vq, elem, 0); - g_free(elem); - return -EINVAL; + goto detach; } if (n->has_vnet_hdr) { - if (iov_to_buf(out_sg, out_num, 0, &mhdr, n->guest_hdr_len) < + if (iov_to_buf(out_sg, out_num, 0, &vhdr, n->guest_hdr_len) < n->guest_hdr_len) { virtio_error(vdev, "virtio-net header incorrect"); - virtqueue_detach_element(q->tx_vq, elem, 0); - g_free(elem); - return -EINVAL; + goto detach; } if (n->needs_vnet_hdr_swap) { - virtio_net_hdr_swap(vdev, (void *) &mhdr); - sg2[0].iov_base = &mhdr; + virtio_net_hdr_swap(vdev, (void *) &vhdr); + sg2[0].iov_base = &vhdr; sg2[0].iov_len = n->guest_hdr_len; out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1, out_sg, out_num, @@ -2746,6 +2776,11
@@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q) n->guest_hdr_len, -1); out_num = sg_num; out_sg = sg; + + if (out_num < 1) { + virtio_error(vdev, "virtio-net nothing to send"); + goto detach; + } } ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index), @@ -2766,6 +2801,11 @@ drop: } } return num_packets; + +detach: + virtqueue_detach_element(q->tx_vq, elem, 0); + g_free(elem); + return -EINVAL; } static void virtio_net_tx_timer(void *opaque); @@ -2804,6 +2844,10 @@ static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq) VirtIONet *n = VIRTIO_NET(vdev); VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; + if (unlikely(n->vhost_started)) { + return; + } + if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) { virtio_net_drop_tx_queue_data(vdev, vq); return; @@ -3632,18 +3676,20 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) virtio_net_set_config_size(n, n->host_features); virtio_init(vdev, VIRTIO_ID_NET, n->config_size); + virtio_net_set_default_queue_size(n); + /* * We set a lower limit on RX queue size to what it always was. * Guests that want a smaller ring can always resize it without * help from us (using virtio 1 and up). */ if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE || - n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE || + n->net_conf.rx_queue_size > VIRTIO_NET_VQ_MAX_SIZE || !is_power_of_2(n->net_conf.rx_queue_size)) { error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), " "must be a power of 2 between %d and %d.", n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE, - VIRTQUEUE_MAX_SIZE); + VIRTIO_NET_VQ_MAX_SIZE); virtio_cleanup(vdev); return; } @@ -3933,10 +3979,8 @@ static Property virtio_net_properties[] = { TX_TIMER_INTERVAL), DEFINE_PROP_INT32("x-txburst", VirtIONet, net_conf.txburst, TX_BURST), DEFINE_PROP_STRING("tx", VirtIONet, net_conf.tx), - DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size, - VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE), - DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size, - VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE), + DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size, 0), + DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size, 0), DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0), DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend, true), @@ -3952,6 +3996,46 @@ static Property virtio_net_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static void virtio_net_print_features(uint64_t features) +{ + Property *props = virtio_net_properties; + int feature_cnt = 0; + + if (!features) { + return; + } + printf("virtio_net_feature: "); + + for (; features && props->name; props++) { + /* Except for 'csum', a bitnr of 0 is just the default and not a real feature bit. */ + if (props->bitnr == 0 && strcmp(props->name, "csum")) { + continue; + } + + /* Feature bits beyond 63 are not supported. */ + if (props->bitnr > 63) { + continue; + } + + if (virtio_has_feature(features, props->bitnr)) { + virtio_clear_feature(&features, props->bitnr); + if (feature_cnt != 0) { + printf(", "); + } + printf("%s", props->name); + feature_cnt++; + } + } + + if (features) { + if (feature_cnt != 0) { + printf(", "); + } + printf("unknown bits 0x%"
PRIx64, features); + } + printf("\n"); +} + static void virtio_net_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3966,6 +4050,7 @@ static void virtio_net_class_init(ObjectClass *klass, void *data) vdc->set_config = virtio_net_set_config; vdc->get_features = virtio_net_get_features; vdc->set_features = virtio_net_set_features; + vdc->print_features = virtio_net_print_features; vdc->bad_features = virtio_net_bad_features; vdc->reset = virtio_net_reset; vdc->queue_reset = virtio_net_queue_reset; diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index f026245d1e9eb3a14d39f651c4a9d35c9d2cd201..6fc2a64b0e5a85470ee29def89bb959f563cdb7d 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1516,9 +1516,16 @@ static void nvme_post_cqes(void *opaque) stl_le_p(&n->bar.csts, NVME_CSTS_FAILED); break; } + QTAILQ_REMOVE(&cq->req_list, req, entry); + nvme_inc_cq_tail(cq); nvme_sg_unmap(&req->sg); + + if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) { + qemu_bh_schedule(sq->bh); + } + QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); } if (cq->tail != cq->head) { @@ -2592,6 +2599,7 @@ next: done: iocb->aiocb = NULL; iocb->common.cb(iocb->common.opaque, iocb->ret); + g_free(iocb->range); qemu_aio_unref(iocb); } @@ -2855,7 +2863,7 @@ static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns, uint32_t nlb; nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL, &nlb, NULL, NULL, NULL); - copy_len += nlb + 1; + copy_len += nlb; } if (copy_len > ns->id_ns.mcl) { @@ -4301,7 +4309,7 @@ static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req, nruhsd = ns->fdp.nphs * endgrp->fdp.nrg; trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr); - buf = g_malloc(trans_len); + buf = g_malloc0(trans_len); trans_len = MIN(trans_len, len); @@ -4352,7 +4360,7 @@ static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req) NvmeNamespace *ns = req->ns; uint32_t cdw10 = le32_to_cpu(cmd->cdw10); uint16_t ret = NVME_SUCCESS; - uint32_t npid = (cdw10 >> 1) + 1; + uint32_t npid = (cdw10 >> 16) + 1; unsigned int i = 0; g_autofree uint16_t *pids = NULL; uint32_t maxnpid; @@ -5882,7 +5890,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); uint32_t nsid = le32_to_cpu(cmd->nsid); - uint32_t result; + uint32_t result = 0; uint8_t fid = NVME_GETSETFEAT_FID(dw10); NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); uint16_t iv; @@ -7574,7 +7582,6 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) /* Completion queue doorbell write */ uint16_t new_head = val & 0xffff; - int start_sqs; NvmeCQueue *cq; qid = (addr - (0x1000 + (1 << 2))) >> 3; @@ -7625,18 +7632,15 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head); - start_sqs = nvme_cq_full(cq) ? 
1 : 0; + /* schedule deferred CQE posting if the queue was previously full */ + if (nvme_cq_full(cq)) { + qemu_bh_schedule(cq->bh); + } + cq->head = new_head; if (!qid && n->dbbuf_enabled) { stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED); } - if (start_sqs) { - NvmeSQueue *sq; - QTAILQ_FOREACH(sq, &cq->sq_list, entry) { - qemu_bh_schedule(sq->bh); - } - qemu_bh_schedule(cq->bh); - } if (cq->tail == cq->head) { if (cq->irq_enabled) { @@ -7924,7 +7928,7 @@ static void nvme_init_state(NvmeCtrl *n) n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); QTAILQ_INIT(&n->aer_queue); - list->numcntl = cpu_to_le16(max_vfs); + list->numcntl = max_vfs; for (i = 0; i < max_vfs; i++) { sctrl = &list->sec[i]; sctrl->pcid = cpu_to_le16(n->cntlid); @@ -8466,36 +8470,26 @@ static void nvme_pci_reset(DeviceState *qdev) nvme_ctrl_reset(n, NVME_RESET_FUNCTION); } -static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address, - uint32_t val, int len) +static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs) { NvmeCtrl *n = NVME(dev); NvmeSecCtrlEntry *sctrl; - uint16_t sriov_cap = dev->exp.sriov_cap; - uint32_t off = address - sriov_cap; - int i, num_vfs; - - if (!sriov_cap) { - return; - } + int i; - if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { - if (!(val & PCI_SRIOV_CTRL_VFE)) { - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - for (i = 0; i < num_vfs; i++) { - sctrl = &n->sec_ctrl_list.sec[i]; - nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); - } - } + for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) { + sctrl = &n->sec_ctrl_list.sec[i]; + nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false); } } static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, uint32_t val, int len) { - nvme_sriov_pre_write_ctrl(dev, address, val, len); + uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); + pci_default_write_config(dev, address, val, len); pcie_cap_flr_write_config(dev, address, val, len); + nvme_sriov_post_write_config(dev, old_num_vfs); } static const VMStateDescription nvme_vmstate = { diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c index 4e4524673a39add1b830723d4e2e6d10a86ab582..d32079ebdfab493a7eb61e9cbedc6bd2d3dbfd0b 100644 --- a/hw/nvram/fw_cfg.c +++ b/hw/nvram/fw_cfg.c @@ -729,7 +729,6 @@ static void *fw_cfg_modify_bytes_read(FWCfgState *s, uint16_t key, ptr = s->entries[arch][key].data; s->entries[arch][key].data = data; s->entries[arch][key].len = len; - s->entries[arch][key].callback_opaque = NULL; s->entries[arch][key].allow_write = false; return ptr; diff --git a/hw/pci-bridge/Kconfig b/hw/pci-bridge/Kconfig index 67077366cc7daa00b04a140627e30e79fe1fe789..449ec9864357c77ab870dd2e7f91e68ae1739cca 100644 --- a/hw/pci-bridge/Kconfig +++ b/hw/pci-bridge/Kconfig @@ -1,3 +1,8 @@ +config PCI_BRIDGE + bool + default y if PCI_DEVICES + depends on PCI + config PCIE_PORT bool default y if PCI_DEVICES diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c index 1ce4e7bebae7ef6719d9ec932e132ba04b69fe73..1e1ab5bb19fe093daa9533f7963f032696038a2c 100644 --- a/hw/pci-bridge/gen_pcie_root_port.c +++ b/hw/pci-bridge/gen_pcie_root_port.c @@ -145,6 +145,8 @@ static Property gen_rp_props[] = { speed, PCIE_LINK_SPEED_16), DEFINE_PROP_PCIE_LINK_WIDTH("x-width", PCIESlot, width, PCIE_LINK_WIDTH_32), + DEFINE_PROP_UINT8("fast-plug", PCIESlot, fast_plug, 0), + DEFINE_PROP_UINT8("fast-unplug", PCIESlot, fast_unplug, 0), DEFINE_PROP_END_OF_LIST() }; diff --git a/hw/pci-bridge/meson.build
b/hw/pci-bridge/meson.build index 6d5ad9f37b226c143d51ff60d87f76de59c78801..a8b88e909998cb71c9e9119784cce551bc17b01b 100644 --- a/hw/pci-bridge/meson.build +++ b/hw/pci-bridge/meson.build @@ -1,5 +1,5 @@ pci_ss = ss.source_set() -pci_ss.add(files('pci_bridge_dev.c')) +pci_ss.add(when: 'CONFIG_PCI_BRIDGE', if_true: files('pci_bridge_dev.c')) pci_ss.add(when: 'CONFIG_I82801B11', if_true: files('i82801b11.c')) pci_ss.add(when: 'CONFIG_IOH3420', if_true: files('ioh3420.c')) pci_ss.add(when: 'CONFIG_PCIE_PORT', if_true: files('pcie_root_port.c', 'gen_pcie_root_port.c')) diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c index f477f97847d753341f341281ed2abed2ce242574..004142709cd6287d7f03133c6d2987fa8b923689 100644 --- a/hw/pci-host/designware.c +++ b/hw/pci-host/designware.c @@ -360,7 +360,7 @@ static void designware_pcie_root_config_write(PCIDevice *d, uint32_t address, case DESIGNWARE_PCIE_ATU_UPPER_TARGET: viewport->target &= 0x00000000FFFFFFFFULL; - viewport->target |= val; + viewport->target |= (uint64_t)val << 32; break; case DESIGNWARE_PCIE_ATU_LIMIT: diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index 1092dc3b7082bc8cedbcb1d697b729ead355fd54..162f6221abde87690d160fc0167fd65287bbd957 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -49,9 +49,10 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) } } -static void acpi_dsdt_add_pci_osc(Aml *dev) +static void acpi_dsdt_add_pci_osc(Aml *dev, bool preserve_config) { Aml *method, *UUID, *ifctx, *ifctx1, *elsectx, *buf; + uint8_t byte_list[1] = {0}; /* Declare an _OSC (OS Control Handoff) method */ aml_append(dev, aml_name_decl("SUPP", aml_int(0))); @@ -113,10 +114,24 @@ static void acpi_dsdt_add_pci_osc(Aml *dev) UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); ifctx = aml_if(aml_equal(aml_arg(0), UUID)); ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); - uint8_t byte_list[1] = {1}; + if (preserve_config) { + /* advertise support for DSM function 5 in addition to function 0 */ + byte_list[0] = 0x21; + } buf = aml_buffer(1, byte_list); aml_append(ifctx1, aml_return(buf)); aml_append(ifctx, ifctx1); + + if (preserve_config) { + Aml *ifctx2 = aml_if(aml_equal(aml_arg(2), aml_int(5))); + /* + * 0 - The operating system must not ignore the PCI configuration that + * firmware has done at boot time.
+ */ + aml_append(ifctx2, aml_return(aml_int(0))); + aml_append(ifctx, ifctx2); + } + aml_append(method, ifctx); byte_list[0] = 0; @@ -188,7 +203,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) if (is_cxl) { build_cxl_osc_method(dev); } else { - acpi_dsdt_add_pci_osc(dev); + acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); } aml_append(scope, dev); @@ -263,7 +278,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) } aml_append(dev, aml_name_decl("_CRS", rbuf)); - acpi_dsdt_add_pci_osc(dev); + acpi_dsdt_add_pci_osc(dev, cfg->preserve_config); Aml *dev_res0 = aml_device("%s", "RES0"); aml_append(dev_res0, aml_name_decl("_HID", aml_string("PNP0C02"))); diff --git a/hw/pci/pci.c b/hw/pci/pci.c index c49417abb2df4dba40dcc52e61ee94c36300d328..447ef2b16364c0de83a859845c29748d0092c3dc 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -85,6 +85,8 @@ static Property pci_props[] = { QEMU_PCIE_ERR_UNC_MASK_BITNR, true), DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), DEFINE_PROP_END_OF_LIST() }; @@ -1201,6 +1203,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, "bus master container", UINT64_MAX); address_space_init(&pci_dev->bus_master_as, &pci_dev->bus_master_container_region, pci_dev->name); + pci_dev->bus_master_as.max_bounce_buffer_size = + pci_dev->max_bounce_buffer_size; if (phase_check(PHASE_MACHINE_READY)) { pci_init_bus_master(pci_dev); @@ -2411,6 +2415,7 @@ static void pci_add_option_rom(PCIDevice *pdev, bool is_default_rom, snprintf(name, sizeof(name), "%s.rom", vmsd ? vmsd->name : object_get_typename(OBJECT(pdev))); + qemu_log("add rom file: %s\n", name); pdev->has_rom = true; memory_region_init_rom(&pdev->rom, OBJECT(pdev), name, pdev->romsize, &error_fatal); @@ -2657,6 +2662,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data) k->unrealize = pci_qdev_unrealize; k->bus_type = TYPE_PCI_BUS; device_class_set_props(k, pci_props); + object_class_property_set_description( + klass, "x-max-bounce-buffer-size", + "Maximum buffer size allocated for bounce buffers used for mapped " + "access to indirect DMA memory"); } static void pci_device_class_base_init(ObjectClass *klass, void *data) @@ -2672,11 +2681,27 @@ static void pci_device_class_base_init(ObjectClass *klass, void *data) } } -AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) +/* + * Get IOMMU root bus, aliased bus and devfn of a PCI device + * + * IOMMU root bus is needed by all call sites to call into iommu_ops. + * For call sites which don't need aliased BDF, passing NULL to + * aliased_[bus|devfn] is allowed. + * + * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device. + * + * @aliased_bus: return aliased #PCIBus of the PCI device, optional. + * + * @aliased_devfn: return aliased devfn of the PCI device, optional. 
+ */ +static void pci_device_get_iommu_bus_devfn(PCIDevice *dev, + PCIBus **piommu_bus, + PCIBus **aliased_bus, + int *aliased_devfn) { PCIBus *bus = pci_get_bus(dev); PCIBus *iommu_bus = bus; - uint8_t devfn = dev->devfn; + int devfn = dev->devfn; while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) { PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev); @@ -2717,13 +2742,79 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) iommu_bus = parent_bus; } - if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) { + + assert(0 <= devfn && devfn < PCI_DEVFN_MAX); + assert(iommu_bus); + + if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) { + iommu_bus = NULL; + } + + *piommu_bus = iommu_bus; + + if (aliased_bus) { + *aliased_bus = bus; + } + + if (aliased_devfn) { + *aliased_devfn = devfn; + } +} + +AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); + if (iommu_bus) { return iommu_bus->iommu_ops->get_address_space(bus, iommu_bus->iommu_opaque, devfn); } return &address_space_memory; } +bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp) +{ + PCIBus *iommu_bus; + + /* set_iommu_device requires the device's direct BDF, not the aliased BDF */ + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->set_iommu_device) { + return iommu_bus->iommu_ops->set_iommu_device(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn, hiod, errp); + } + return true; +} + +void pci_device_unset_iommu_device(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->unset_iommu_device) { + iommu_bus->iommu_ops->unset_iommu_device(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn); + } +} + +bool pci_device_get_pasid_cap(PCIDevice *dev) +{ + PCIBus *iommu_bus; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, NULL, NULL); + if (iommu_bus && iommu_bus->iommu_ops->get_pasid_cap) { + return iommu_bus->iommu_ops->get_pasid_cap(pci_get_bus(dev), + iommu_bus->iommu_opaque, + dev->devfn); + } + return false; +} + void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque) { /* diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 6db0cf69cd8a83ae2d439b3e194f4a8cb70ee244..a5b4e54bd775e02ee493a4fca2aeeaee3df9d575 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -97,13 +97,6 @@ static void pcie_cap_fill_slot_lnk(PCIDevice *dev) return; } - /* Clear and fill LNKCAP from what was configured above */ - pci_long_test_and_clear_mask(exp_cap + PCI_EXP_LNKCAP, - PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS); - pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, - QEMU_PCI_EXP_LNKCAP_MLW(s->width) | - QEMU_PCI_EXP_LNKCAP_MLS(s->speed)); - /* * Link bandwidth notification is required for all root ports and * downstream ports supporting links wider than x1 or multiple link @@ -111,6 +104,12 @@ */ if (s->width > QEMU_PCI_EXP_LNK_X1 || s->speed > QEMU_PCI_EXP_LNK_2_5GT) { + /* Clear and fill LNKCAP from what was configured above */ + pci_long_test_and_clear_mask(exp_cap + PCI_EXP_LNKCAP, + PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS); + pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, + QEMU_PCI_EXP_LNKCAP_MLW(s->width) | + QEMU_PCI_EXP_LNKCAP_MLS(s->speed)); pci_long_test_and_set_mask(exp_cap + PCI_EXP_LNKCAP, PCI_EXP_LNKCAP_LBNC); }
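The pci_device_get_iommu_bus_devfn() helper introduced above centralizes one idea: climb from the device's bus toward the root until a bus that owns iommu_ops is found, updating the (bus, devfn) alias as bridges are crossed, so that the address-space lookup, set_iommu_device and get_pasid_cap can all share it. A compilable sketch of that walk follows; the types are trimmed stand-ins rather than the real PCIBus/PCIDevice, and the bridge-aliasing rules are heavily simplified (the real loop distinguishes PCIe ports from conventional bridges):

#include <stdio.h>

typedef struct BusSketch BusSketch;

typedef struct DevSketch {
    BusSketch *bus;
    int devfn;
} DevSketch;

struct BusSketch {
    const void *iommu_ops;   /* non-NULL when this bus owns an IOMMU */
    DevSketch *parent_dev;   /* bridge hosting this bus, NULL at the root */
};

/* Climb toward the root until a bus with iommu_ops is found; requests
 * crossing a bridge are seen by the IOMMU with the bridge's own devfn,
 * so the alias is updated at each hop. *piommu_bus is NULL on failure. */
static void get_iommu_bus_devfn(DevSketch *dev, BusSketch **piommu_bus,
                                BusSketch **aliased_bus, int *aliased_devfn)
{
    BusSketch *bus = dev->bus;
    BusSketch *iommu_bus = bus;
    int devfn = dev->devfn;

    while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {
        devfn = iommu_bus->parent_dev->devfn;   /* alias to the bridge */
        bus = iommu_bus->parent_dev->bus;
        iommu_bus = bus;
    }

    *piommu_bus = (iommu_bus && iommu_bus->iommu_ops) ? iommu_bus : NULL;
    if (aliased_bus) {
        *aliased_bus = bus;
    }
    if (aliased_devfn) {
        *aliased_devfn = devfn;
    }
}

int main(void)
{
    BusSketch root = { .iommu_ops = "ops", .parent_dev = NULL };
    DevSketch bridge = { .bus = &root, .devfn = 8 };
    BusSketch secondary = { .iommu_ops = NULL, .parent_dev = &bridge };
    DevSketch dev = { .bus = &secondary, .devfn = 3 };
    BusSketch *iommu_bus, *abus;
    int adevfn;

    get_iommu_bus_devfn(&dev, &iommu_bus, &abus, &adevfn);
    printf("iommu bus: %s, aliased devfn: %d\n",
           iommu_bus ? "found" : "none", adevfn);   /* found, 8 */
    return 0;
}

The optional aliased_bus/aliased_devfn out-parameters mirror the patch's design choice: callers that only need the IOMMU root bus pass NULL and skip the alias bookkeeping.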
@@ -556,6 +555,7 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, uint8_t *exp_cap = hotplug_pdev->config + hotplug_pdev->exp.exp_cap; uint32_t sltcap = pci_get_word(exp_cap + PCI_EXP_SLTCAP); uint16_t sltctl = pci_get_word(exp_cap + PCI_EXP_SLTCTL); + PCIESlot *s = PCIE_SLOT(hotplug_pdev); /* Check if hot-unplug is disabled on the slot */ if ((sltcap & PCI_EXP_SLTCAP_HPC) == 0) { @@ -601,7 +601,17 @@ void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, return; } - pcie_cap_slot_push_attention_button(hotplug_pdev); + if ((pci_dev->cap_present & QEMU_PCIE_LNKSTA_DLLLA) && s->fast_plug) { + pci_word_test_and_clear_mask(pci_dev->config + pci_dev->exp.exp_cap + PCI_EXP_LNKSTA, + PCI_EXP_LNKSTA_DLLLA); + } + + if (s->fast_unplug) { + pcie_cap_slot_event(hotplug_pdev, + PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); + } else { + pcie_cap_slot_push_attention_button(hotplug_pdev); + } } /* pci express slot for pci express root/downstream port @@ -1113,3 +1123,15 @@ void pcie_acs_reset(PCIDevice *dev) pci_set_word(dev->config + dev->exp.acs_cap + PCI_ACS_CTRL, 0); } } + +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps) +{ + pcie_add_capability(dev, PCI_EXT_CAP_ID_PASID, 1, + offset, PCI_EXT_CAP_PASID_SIZEOF); + + dev->exp.pasid_cap = offset; + + pci_set_word(dev->config + offset + PCI_PASID_CAP, caps); + + pci_set_word(dev->wmask + dev->exp.pasid_cap + PCI_PASID_CTRL, 0x7); +} diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index a1fe65f5d801dffae113f3231aadf934913bbfc9..da209b7f47fd38d33927281d79af2f4696b54ae3 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -176,6 +176,9 @@ static void register_vfs(PCIDevice *dev) assert(sriov_cap > 0); num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { + return; + } dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs); diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index 384226296bfba2ffdcfe740296780ac65b8f854f..7722e52fa40b3fd3a95486afcf3adc930487d0b5 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -203,6 +203,8 @@ static void dt_i2c_create(void *fdt, const char *soc, const char *mpic, qemu_fdt_setprop_cells(fdt, i2c, "cell-index", 0); qemu_fdt_setprop_cells(fdt, i2c, "interrupts", irq0, 0x2); qemu_fdt_setprop_phandle(fdt, i2c, "interrupt-parent", mpic); + qemu_fdt_setprop_cell(fdt, i2c, "#size-cells", 0); + qemu_fdt_setprop_cell(fdt, i2c, "#address-cells", 1); qemu_fdt_setprop_string(fdt, "/aliases", alias, i2c); g_free(i2c); @@ -832,7 +834,7 @@ static DeviceState *ppce500_init_mpic_qemu(PPCE500MachineState *pms, } static DeviceState *ppce500_init_mpic_kvm(const PPCE500MachineClass *pmc, - IrqLines *irqs, Error **errp) + Error **errp) { #ifdef CONFIG_KVM DeviceState *dev; @@ -872,7 +874,7 @@ static DeviceState *ppce500_init_mpic(PPCE500MachineState *pms, Error *err = NULL; if (kvm_kernel_irqchip_allowed()) { - dev = ppce500_init_mpic_kvm(pmc, irqs, &err); + dev = ppce500_init_mpic_kvm(pmc, &err); } if (kvm_kernel_irqchip_required() && !dev) { error_reportf_err(err, @@ -1024,7 +1026,7 @@ void ppce500_init(MachineState *machine) sysbus_connect_irq(s, 0, qdev_get_gpio_in(mpicdev, MPC8544_I2C_IRQ)); memory_region_add_subregion(ccsr_addr_space, MPC8544_I2C_REGS_OFFSET, sysbus_mmio_get_region(s, 0)); - i2c = (I2CBus *)qdev_get_child_bus(dev, "i2c"); + i2c = I2C_BUS(qdev_get_child_bus(dev, "i2c")); i2c_slave_create_simple(i2c, "ds1338", RTC_REGS_OFFSET); /* eSDHC */ @@ -1073,7 +1075,7 @@ void 
ppce500_init(MachineState *machine) memory_region_add_subregion(ccsr_addr_space, MPC8544_PCI_REGS_OFFSET, sysbus_mmio_get_region(s, 0)); - pci_bus = (PCIBus *)qdev_get_child_bus(dev, "pci.0"); + pci_bus = PCI_BUS(qdev_get_child_bus(dev, "pci.0")); if (!pci_bus) printf("couldn't create PCI controller!\n"); diff --git a/hw/ppc/pnv_i2c.c b/hw/ppc/pnv_i2c.c index 656a48eebe51597e9fee89f4ccbceb8bd3b14804..0ac6aa5c06f055d1850cb550c87856772326155e 100644 --- a/hw/ppc/pnv_i2c.c +++ b/hw/ppc/pnv_i2c.c @@ -673,6 +673,9 @@ static void pnv_i2c_class_init(ObjectClass *klass, void *data) xscomc->dt_xscom = pnv_i2c_dt_xscom; + /* Reason: This device is part of the CPU and cannot be used separately */ + dc->user_creatable = false; + dc->desc = "PowerNV I2C"; dc->realize = pnv_i2c_realize; device_class_set_props(dc, pnv_i2c_properties); diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index f283f7e38d619fd65985146b5e1572c221f95c53..d1d07bec4644da4ae6a99d3357d6d17ff66264de 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -84,27 +84,27 @@ static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op) static VFIOContainer *vfio_eeh_as_container(AddressSpace *as) { VFIOAddressSpace *space = vfio_get_address_space(as); - VFIOContainer *container = NULL; + VFIOContainerBase *bcontainer = NULL; if (QLIST_EMPTY(&space->containers)) { /* No containers to act on */ goto out; } - container = QLIST_FIRST(&space->containers); + bcontainer = QLIST_FIRST(&space->containers); - if (QLIST_NEXT(container, next)) { + if (QLIST_NEXT(bcontainer, next)) { /* * We don't yet have logic to synchronize EEH state across * multiple containers */ - container = NULL; + bcontainer = NULL; goto out; } out: vfio_put_address_space(space); - return container; + return container_of(bcontainer, VFIOContainer, bcontainer); } static bool vfio_eeh_as_ok(AddressSpace *as) diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c index e3b430a81f4f195e6aaf967bc45cc80b42f3c26c..b5b6514d79fcd71afece90da18e83ed35d37d6b9 100644 --- a/hw/ppc/vof.c +++ b/hw/ppc/vof.c @@ -646,7 +646,7 @@ static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base) mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen); g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc)); if (sc == 2) { - mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + sizeof(uint32_t) * ac)); + mem0_end = ldq_be_p(mem0_reg + sizeof(uint32_t) * ac); } else { mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + sizeof(uint32_t) * ac)); } diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c index 8b10c32a3c6eee5446ed3a0ee54433b8e577355e..8b708422fe5291511b94d6c62f2bfa47562396df 100644 --- a/hw/remote/vfio-user-obj.c +++ b/hw/remote/vfio-user-obj.c @@ -281,7 +281,7 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf, while (bytes > 0) { len = (bytes > pci_access_width) ? 
pci_access_width : bytes; if (is_write) { - memcpy(&val, ptr, len); + val = ldn_le_p(ptr, len); pci_host_config_write_common(o->pci_dev, offset, pci_config_size(o->pci_dev), val, len); @@ -289,7 +289,7 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf, } else { val = pci_host_config_read_common(o->pci_dev, offset, pci_config_size(o->pci_dev), len); - memcpy(ptr, &val, len); + stn_le_p(ptr, len, val); trace_vfu_cfg_read(offset, val); } offset += len; diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig index b6a5eb4452e44fd95ac2a78118295fe4f3475d66..1e11ac9432274f26c8febea25fba09bb597aebeb 100644 --- a/hw/riscv/Kconfig +++ b/hw/riscv/Kconfig @@ -41,6 +41,7 @@ config RISCV_VIRT select RISCV_IMSIC select SIFIVE_PLIC select SIFIVE_TEST + select SMBIOS select VIRTIO_MMIO select FW_CFG_DMA select PLATFORM_BUS diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c index d2eac2415619c704349a6bdad23a2fb2cf27ffd2..9b29ed1108f6ceefac3bc0ac48430783f6e8154b 100644 --- a/hw/riscv/virt.c +++ b/hw/riscv/virt.c @@ -36,6 +36,7 @@ #include "hw/riscv/boot.h" #include "hw/riscv/numa.h" #include "kvm/kvm_riscv.h" +#include "hw/firmware/smbios.h" #include "hw/intc/riscv_aclint.h" #include "hw/intc/riscv_aplic.h" #include "hw/intc/riscv_imsic.h" @@ -1249,6 +1250,45 @@ static void create_platform_bus(RISCVVirtState *s, DeviceState *irqchip) sysbus_mmio_get_region(sysbus, 0)); } +static void virt_build_smbios(RISCVVirtState *s) +{ + MachineClass *mc = MACHINE_GET_CLASS(s); + MachineState *ms = MACHINE(s); + uint8_t *smbios_tables, *smbios_anchor; + size_t smbios_tables_len, smbios_anchor_len; + struct smbios_phys_mem_area mem_array; + const char *product = "QEMU Virtual Machine"; + + if (kvm_enabled()) { + product = "KVM Virtual Machine"; + } + + smbios_set_defaults("QEMU", product, mc->name, false, + true, SMBIOS_ENTRY_POINT_TYPE_64); + + if (riscv_is_32bit(&s->soc[0])) { + smbios_set_default_processor_family(0x200); + } else { + smbios_set_default_processor_family(0x201); + } + + /* build the array of physical mem area from base_memmap */ + mem_array.address = s->memmap[VIRT_DRAM].base; + mem_array.length = ms->ram_size; + + smbios_get_tables(ms, &mem_array, 1, + &smbios_tables, &smbios_tables_len, + &smbios_anchor, &smbios_anchor_len, + &error_fatal); + + if (smbios_anchor) { + fw_cfg_add_file(s->fw_cfg, "etc/smbios/smbios-tables", + smbios_tables, smbios_tables_len); + fw_cfg_add_file(s->fw_cfg, "etc/smbios/smbios-anchor", + smbios_anchor, smbios_anchor_len); + } +} + static void virt_machine_done(Notifier *notifier, void *data) { RISCVVirtState *s = container_of(notifier, RISCVVirtState, @@ -1337,6 +1377,8 @@ static void virt_machine_done(Notifier *notifier, void *data) riscv_setup_direct_kernel(kernel_entry, fdt_load_addr); } + virt_build_smbios(s); + if (virt_is_acpi_enabled(s)) { virt_acpi_setup(s); } diff --git a/hw/rtc/ls7a_rtc.c b/hw/rtc/ls7a_rtc.c index 1f9e38a735bd75886908e7009f14a26ca1e6261d..be9546c8505d7af32603b39a9df63c295530e66d 100644 --- a/hw/rtc/ls7a_rtc.c +++ b/hw/rtc/ls7a_rtc.c @@ -145,20 +145,22 @@ static void toymatch_write(LS7ARtcState *s, uint64_t val, int num) now = qemu_clock_get_ms(rtc_clock); toymatch_val_to_time(s, val, &tm); expire_time = now + (qemu_timedate_diff(&tm) - s->offset_toy) * 1000; - timer_mod(s->toy_timer[num], expire_time); + if (expire_time > now) + timer_mod(s->toy_timer[num], expire_time); } } static void rtcmatch_write(LS7ARtcState *s, uint64_t val, int num) { - uint64_t expire_ns; + int64_t expire_ns; /* it do not support write when toy disabled */ 
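The ls7a_rtc guards in this hunk and the ones that follow all apply the same rule: compute the would-be deadline first, then arm the timer only if the deadline still lies in the future, since modding a timer with an already-elapsed deadline makes it fire immediately and raise a spurious alarm interrupt (this is also why the expiry variables are widened to signed int64_t, so a stale match value compares as past rather than wrapping). A stand-alone sketch of the pattern; the clock and timer here are illustrative stand-ins, not the QEMU API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int64_t now_ms = 1000;   /* stand-in for qemu_clock_get_ms() */

static void timer_arm(const char *name, int64_t deadline_ms)
{
    /* stand-in for timer_mod(): schedules a callback at deadline_ms */
    printf("%s armed for t=%lld ms\n", name, (long long)deadline_ms);
}

/* Arm only when the deadline lies ahead; a stale deadline would fire
 * the callback at once and inject a bogus alarm into the guest. */
static bool arm_if_future(const char *name, int64_t deadline_ms)
{
    if (deadline_ms <= now_ms) {
        return false;   /* match value already in the past: skip */
    }
    timer_arm(name, deadline_ms);
    return true;
}

int main(void)
{
    arm_if_future("toy_timer", 900);    /* skipped: 900 <= 1000 */
    arm_if_future("toy_timer", 1500);   /* armed */
    return 0;
}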
if (rtc_enabled(s)) { s->rtcmatch[num] = val; /* calculate expire time */ expire_ns = ticks_to_ns(val) - ticks_to_ns(s->offset_rtc); - timer_mod_ns(s->rtc_timer[num], expire_ns); + if (expire_ns > 0) + timer_mod_ns(s->rtc_timer[num], expire_ns); } } @@ -185,7 +187,7 @@ static void ls7a_rtc_stop(LS7ARtcState *s) static void ls7a_toy_start(LS7ARtcState *s) { int i; - uint64_t expire_time, now; + int64_t expire_time, now; struct tm tm = {}; now = qemu_clock_get_ms(rtc_clock); @@ -194,19 +196,21 @@ static void ls7a_toy_start(LS7ARtcState *s) for (i = 0; i < TIMER_NUMS; i++) { toymatch_val_to_time(s, s->toymatch[i], &tm); expire_time = now + (qemu_timedate_diff(&tm) - s->offset_toy) * 1000; - timer_mod(s->toy_timer[i], expire_time); + if (expire_time > now) + timer_mod(s->toy_timer[i], expire_time); } } static void ls7a_rtc_start(LS7ARtcState *s) { int i; - uint64_t expire_time; + int64_t expire_time; /* recalculate expire time and enable timer */ for (i = 0; i < TIMER_NUMS; i++) { expire_time = ticks_to_ns(s->rtcmatch[i]) - ticks_to_ns(s->offset_rtc); - timer_mod_ns(s->rtc_timer[i], expire_time); + if (expire_time > 0) + timer_mod_ns(s->rtc_timer[i], expire_time); } } @@ -370,7 +374,7 @@ static void toy_timer_cb(void *opaque) LS7ARtcState *s = opaque; if (toy_enabled(s)) { - qemu_irq_raise(s->irq); + qemu_irq_pulse(s->irq); } } @@ -379,7 +383,7 @@ static void rtc_timer_cb(void *opaque) LS7ARtcState *s = opaque; if (rtc_enabled(s)) { - qemu_irq_raise(s->irq); + qemu_irq_pulse(s->irq); } } diff --git a/hw/rtc/mc146818rtc.c b/hw/rtc/mc146818rtc.c index 2d391a83969e92529125a9f3f7ab846d7f84cabf..e61c76d0605ed9e5e1cafaf816bc14a4955a6d23 100644 --- a/hw/rtc/mc146818rtc.c +++ b/hw/rtc/mc146818rtc.c @@ -606,7 +606,8 @@ static void rtc_set_time(MC146818RtcState *s) s->base_rtc = mktimegm(&tm); s->last_update = qemu_clock_get_ns(rtc_clock); - qapi_event_send_rtc_change(qemu_timedate_diff(&tm), qom_path); + set_rtc_date_diff(qemu_timedate_diff(&tm)); + qapi_event_send_rtc_change(get_rtc_date_diff(), qom_path); } static void rtc_set_cmos(MC146818RtcState *s, const struct tm *tm) diff --git a/hw/rtc/pl031.c b/hw/rtc/pl031.c index b01d0e75d1af40334289df94795fa5e5e3c513e6..57e9a3561663bce6e8002f26bbb2fc6ef0978735 100644 --- a/hw/rtc/pl031.c +++ b/hw/rtc/pl031.c @@ -63,6 +63,15 @@ static uint32_t pl031_get_count(PL031State *s) return s->tick_offset + now / NANOSECONDS_PER_SECOND; } +static void pl031_get_date(Object *obj, struct tm *current_tm, Error **errp) +{ + PL031State *s = PL031(obj); + time_t ti = pl031_get_count(s); + + /* Changed to UTC time */ + gmtime_r(&ti, current_tm); +} + static void pl031_set_alarm(PL031State *s) { uint32_t ticks; @@ -144,7 +153,8 @@ static void pl031_write(void * opaque, hwaddr offset, s->tick_offset += value - pl031_get_count(s); qemu_get_timedate(&tm, s->tick_offset); - qapi_event_send_rtc_change(qemu_timedate_diff(&tm), qom_path); + set_rtc_date_diff(qemu_timedate_diff(&tm)); + qapi_event_send_rtc_change(get_rtc_date_diff(), qom_path); pl031_set_alarm(s); break; @@ -201,6 +211,20 @@ static void pl031_init(Object *obj) qemu_clock_get_ns(rtc_clock) / NANOSECONDS_PER_SECOND; s->timer = timer_new_ns(rtc_clock, pl031_interrupt, s); + object_property_add_tm(OBJECT(s), "date", pl031_get_date); +} + +static void pl031_realize(DeviceState *d, Error **errp) +{ + object_property_add_alias(qdev_get_machine(), "rtc-time", + OBJECT(d), "date"); +} + +static void pl031_unrealize(DeviceState *d) +{ + if (object_property_find(qdev_get_machine(), "rtc-time")) { + 
object_property_del(qdev_get_machine(), "rtc-time"); + } } static void pl031_finalize(Object *obj) @@ -337,6 +361,8 @@ static void pl031_class_init(ObjectClass *klass, void *data) DeviceClass *dc = DEVICE_CLASS(klass); dc->vmsd = &vmstate_pl031; + dc->realize = pl031_realize; + dc->unrealize = pl031_unrealize; device_class_set_props(dc, pl031_properties); } diff --git a/hw/s390x/Kconfig b/hw/s390x/Kconfig index 4c068d7960b973474b84d3cef9d4a3bf31ca4f9f..26ad1044858ce9ba3e78c65ffb9f295c31d7ab4c 100644 --- a/hw/s390x/Kconfig +++ b/hw/s390x/Kconfig @@ -6,6 +6,7 @@ config S390_CCW_VIRTIO imply VFIO_CCW imply WDT_DIAG288 imply PCIE_DEVICES + imply IOMMUFD select PCI_EXPRESS select S390_FLIC select S390_FLIC_KVM if KVM diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c index 893e71a41b2ddd88a007bd7d395895fc4d444540..69bf04e23ab01db6c0a5c37321d62069e1872ccb 100644 --- a/hw/s390x/sclp.c +++ b/hw/s390x/sclp.c @@ -21,13 +21,14 @@ #include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/ipl.h" #include "hw/s390x/cpu-topology.h" +#include "hw/s390x/s390-virtio-ccw.h" -static inline SCLPDevice *get_sclp_device(void) +static SCLPDevice *get_sclp_device(void) { static SCLPDevice *sclp; if (!sclp) { - sclp = SCLP(object_resolve_path_type("", TYPE_SCLP, NULL)); + sclp = S390_CCW_MACHINE(qdev_get_machine())->sclp; } return sclp; } diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index fc4b77fdb02707fa51585b511ed016ffb7315bdc..7b60ac11f550ba80fcdd78b5381c479b5b3e88b7 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -144,14 +144,10 @@ void scsi_bus_init_named(SCSIBus *bus, size_t bus_size, DeviceState *host, qbus_set_bus_hotplug_handler(BUS(bus)); } -static void scsi_dma_restart_bh(void *opaque) +void scsi_retry_requests(SCSIDevice *s) { - SCSIDevice *s = opaque; SCSIRequest *req, *next; - qemu_bh_delete(s->bh); - s->bh = NULL; - aio_context_acquire(blk_get_aio_context(s->conf.blk)); QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { scsi_req_ref(req); @@ -171,6 +167,17 @@ static void scsi_dma_restart_bh(void *opaque) scsi_req_unref(req); } aio_context_release(blk_get_aio_context(s->conf.blk)); +} + +static void scsi_dma_restart_bh(void *opaque) +{ + SCSIDevice *s = opaque; + + qemu_bh_delete(s->bh); + s->bh = NULL; + + scsi_retry_requests(s); + /* Drop the reference that was acquired in scsi_dma_restart_cb */ object_unref(OBJECT(s)); } diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 6691f5edb841b744a25a090d9c5f2d294a18ec5d..282bdc1ee0372cdd26a9def1b5bd8aa5a7b63d72 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -249,12 +249,16 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int ret, bool acct_failed) scsi_req_retry(&r->req); return true; + case BLOCK_ERROR_ACTION_RETRY: + scsi_req_retry(&r->req); + return true; + default: g_assert_not_reached(); } } -static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +static bool scsi_disk_req_handle_error(SCSIDiskReq *r, int ret, bool acct_failed) { if (r->req.io_canceled) { scsi_req_cancel_complete(&r->req); @@ -268,6 +272,18 @@ static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) return false; } +static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +{ + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + + if (r->req.io_canceled || ret < 0) { + return scsi_disk_req_handle_error(r, ret, acct_failed); + } + + blk_error_retry_reset_timeout(s->qdev.conf.blk); + return false; +} + static void scsi_aio_complete(void *opaque, int ret) { 
SCSIDiskReq *r = (SCSIDiskReq *)opaque; @@ -416,7 +432,7 @@ static void scsi_do_read(SCSIDiskReq *r, int ret) SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s)); assert (r->req.aiocb == NULL); - if (scsi_disk_req_check_error(r, ret, false)) { + if (scsi_disk_req_handle_error(r, ret, false)) { goto done; } @@ -457,6 +473,9 @@ static void scsi_do_read_cb(void *opaque, int ret) block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); } else { block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); + if (!r->req.io_canceled) { + blk_error_retry_reset_timeout(s->qdev.conf.blk); + } } scsi_do_read(opaque, ret); aio_context_release(blk_get_aio_context(s->qdev.conf.blk)); @@ -2021,7 +2040,10 @@ static int32_t scsi_disk_emulate_command(SCSIRequest *req, uint8_t *buf) memset(outbuf, 0, r->buflen); switch (req->cmd.buf[0]) { case TEST_UNIT_READY: - assert(blk_is_available(s->qdev.conf.blk)); + if (!blk_is_available(s->qdev.conf.blk)) { + scsi_check_condition(r, SENSE_CODE(NO_MEDIUM)); + return 0; + } break; case INQUIRY: buflen = scsi_disk_emulate_inquiry(req, outbuf); @@ -2391,6 +2413,13 @@ static void scsi_disk_resize_cb(void *opaque) } } +static void scsi_disk_retry_request(void *opaque) +{ + SCSIDiskState *s = opaque; + + scsi_retry_requests(&s->qdev); +} + static void scsi_cd_change_media_cb(void *opaque, bool load, Error **errp) { SCSIDiskState *s = opaque; @@ -2440,12 +2469,14 @@ static const BlockDevOps scsi_disk_removable_block_ops = { .is_medium_locked = scsi_cd_is_medium_locked, .is_tray_open = scsi_cd_is_tray_open, .resize_cb = scsi_disk_resize_cb, + .retry_request_cb = scsi_disk_retry_request, }; static const BlockDevOps scsi_disk_block_ops = { .drained_begin = scsi_disk_drained_begin, .drained_end = scsi_disk_drained_end, .resize_cb = scsi_disk_resize_cb, + .retry_request_cb = scsi_disk_retry_request, }; static void scsi_disk_unit_attention_reported(SCSIDevice *dev) @@ -3241,9 +3272,7 @@ static const TypeInfo scsi_cd_info = { #ifdef __linux__ static Property scsi_block_properties[] = { - DEFINE_BLOCK_ERROR_PROPERTIES(SCSIDiskState, qdev.conf), - DEFINE_PROP_DRIVE("drive", SCSIDiskState, qdev.conf.blk), - DEFINE_PROP_BOOL("share-rw", SCSIDiskState, qdev.conf.share_rw, false), + DEFINE_SCSI_DISK_PROPERTIES(), DEFINE_PROP_UINT16("rotation_rate", SCSIDiskState, rotation_rate, 0), DEFINE_PROP_UINT64("max_unmap_size", SCSIDiskState, max_unmap_size, DEFAULT_MAX_UNMAP_SIZE), diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c index 2417f0ad84799059a626d41099cd537bf4bdb836..12fdd8e748734ca84bb33d4234a11f7cef8a1498 100644 --- a/hw/scsi/scsi-generic.c +++ b/hw/scsi/scsi-generic.c @@ -192,6 +192,10 @@ static int scsi_handle_inquiry_reply(SCSIGenericReq *r, SCSIDevice *s, int len) (r->req.cmd.buf[1] & 0x01)) { page = r->req.cmd.buf[2]; if (page == 0xb0 && r->buflen >= 8) { + if (s->blocksize == 0) { + qemu_log("device blocksize is 0!\n"); + abort(); + } uint8_t buf[16] = {}; uint8_t buf_used = MIN(r->buflen, 16); uint64_t max_transfer = calculate_max_transfer(s); @@ -326,11 +330,23 @@ static void scsi_read_complete(void * opaque, int ret) /* Snoop READ CAPACITY output to set the blocksize. */ if (r->req.cmd.buf[0] == READ_CAPACITY_10 && (ldl_be_p(&r->buf[0]) != 0xffffffffU || s->max_lba == 0)) { - s->blocksize = ldl_be_p(&r->buf[4]); + int new_blocksize = ldl_be_p(&r->buf[4]); + if (s->blocksize != new_blocksize) { + qemu_log("device id=%s type=%d: blocksize %d change to %d\n", + s->qdev.id ? 
s->qdev.id : "null", s->type, + s->blocksize, new_blocksize); + } + s->blocksize = new_blocksize; s->max_lba = ldl_be_p(&r->buf[0]) & 0xffffffffULL; } else if (r->req.cmd.buf[0] == SERVICE_ACTION_IN_16 && (r->req.cmd.buf[1] & 31) == SAI_READ_CAPACITY_16) { - s->blocksize = ldl_be_p(&r->buf[8]); + int new_blocksize = ldl_be_p(&r->buf[8]); + if (s->blocksize != new_blocksize) { + qemu_log("device id=%s type=%d: blocksize %d change to %d\n", + s->qdev.id ? s->qdev.id : "null", s->type, + s->blocksize, new_blocksize); + } + s->blocksize = new_blocksize; s->max_lba = ldq_be_p(&r->buf[0]); } @@ -766,7 +782,6 @@ static void scsi_generic_realize(SCSIDevice *s, Error **errp) /* Only used by scsi-block, but initialize it nevertheless to be clean. */ s->default_scsi_version = -1; - s->io_timeout = DEFAULT_IO_TIMEOUT; scsi_generic_read_device_inquiry(s); } diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 9c751bf29699697e40a24bc5149583f9380b7cee..bc7feb404a3a32dbaf8b365e5960d4843c002aaa 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -781,7 +781,7 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req) req->req.cmd.tag, req->req.cmd.cdb[0]); d = virtio_scsi_device_get(s, req->req.cmd.lun); - if (!d) { + if (!d || !d->qdev.realized) { req->resp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET; virtio_scsi_complete_cmd_req(req); return -ENOENT; diff --git a/hw/sd/sdhci-pci.c b/hw/sd/sdhci-pci.c index 9b7bee8b3fbf21a31646e37c28f288b494d06734..c1eb67cf29257446a398b28c8ad43f1f367da77e 100644 --- a/hw/sd/sdhci-pci.c +++ b/hw/sd/sdhci-pci.c @@ -18,6 +18,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/module.h" +#include "hw/irq.h" #include "hw/qdev-properties.h" #include "hw/sd/sdhci.h" #include "sdhci-internal.h" @@ -49,6 +50,7 @@ static void sdhci_pci_exit(PCIDevice *dev) { SDHCIState *s = PCI_SDHCI(dev); + qemu_free_irq(s->irq); sdhci_common_unrealize(s); sdhci_uninitfn(s); } diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c index 40473b0db099a6289181ffe2795e19ccbe532ae9..e95ea34895dbf47a312ad16b7be68af41c84ae19 100644 --- a/hw/sd/sdhci.c +++ b/hw/sd/sdhci.c @@ -473,6 +473,7 @@ static uint32_t sdhci_read_dataport(SDHCIState *s, unsigned size) } for (i = 0; i < size; i++) { + assert(s->data_count < s->buf_maxsz); value |= s->fifo_buffer[s->data_count] << i * 8; s->data_count++; /* check if we've read all valid data (blksize bytes) from buffer */ @@ -561,6 +562,7 @@ static void sdhci_write_dataport(SDHCIState *s, uint32_t value, unsigned size) } for (i = 0; i < size; i++) { + assert(s->data_count < s->buf_maxsz); s->fifo_buffer[s->data_count] = value & 0xFF; s->data_count++; value >>= 8; @@ -1208,6 +1210,12 @@ sdhci_write(void *opaque, hwaddr offset, uint64_t val, unsigned size) if (!(s->capareg & R_SDHC_CAPAB_SDMA_MASK)) { value &= ~SDHC_TRNS_DMA; } + + /* TRNMOD writes are inhibited while Command Inhibit (DAT) is true */ + if (s->prnsts & SDHC_DATA_INHIBIT) { + mask |= 0xffff; + } + MASKED_WRITE(s->trnmod, mask, value & SDHC_TRNMOD_MASK); MASKED_WRITE(s->cmdreg, mask >> 16, value >> 16); diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 2a90601ac5d93ca1cf2e07d5a722b614939ab71d..c0c5a81e664bbfa70be6e6b965a3d6ef6530186c 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -102,6 +102,7 @@ static struct { #define DEFAULT_CPU_SPEED 2000 static struct { + uint16_t processor_family; const char *sock_pfx, *manufacturer, *version, *serial, *asset, *part; uint64_t max_speed; uint64_t current_speed; @@ -110,6 +111,7 @@ static struct { 
.max_speed = DEFAULT_CPU_SPEED, .current_speed = DEFAULT_CPU_SPEED, .processor_id = 0, + .processor_family = 0x01, /* Other */ }; struct type8_instance { @@ -337,6 +339,10 @@ static const QemuOptDesc qemu_smbios_type4_opts[] = { .name = "part", .type = QEMU_OPT_STRING, .help = "part number", + }, { + .name = "processor-family", + .type = QEMU_OPT_NUMBER, + .help = "processor family", }, { .name = "processor-id", .type = QEMU_OPT_NUMBER, @@ -726,7 +732,7 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance) snprintf(sock_str, sizeof(sock_str), "%s%2x", type4.sock_pfx, instance); SMBIOS_TABLE_SET_STR(4, socket_designation_str, sock_str); t->processor_type = 0x03; /* CPU */ - t->processor_family = 0x01; /* Other */ + t->processor_family = 0xfe; /* use Processor Family 2 field */ SMBIOS_TABLE_SET_STR(4, processor_manufacturer_str, type4.manufacturer); if (type4.processor_id == 0) { t->processor_id[0] = cpu_to_le32(smbios_cpuid_version); @@ -758,7 +764,7 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance) t->thread_count = (threads_per_socket > 255) ? 0xFF : threads_per_socket; t->processor_characteristics = cpu_to_le16(0x02); /* Unknown */ - t->processor_family2 = cpu_to_le16(0x01); /* Other */ + t->processor_family2 = cpu_to_le16(type4.processor_family); if (tbl_len == SMBIOS_TYPE_4_LEN_V30) { t->core_count2 = t->core_enabled2 = cpu_to_le16(cores_per_socket); @@ -983,6 +989,13 @@ void smbios_set_cpuid(uint32_t version, uint32_t features) field = value; \ } +void smbios_set_default_processor_family(uint16_t processor_family) +{ + if (type4.processor_family <= 0x01) { + type4.processor_family = processor_family; + } +} + void smbios_set_defaults(const char *manufacturer, const char *product, const char *version, bool legacy_mode, bool uuid_encoded, SmbiosEntryPointType ep_type) @@ -1402,6 +1415,9 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) return; } save_opt(&type4.sock_pfx, opts, "sock_pfx"); + type4.processor_family = qemu_opt_get_number(opts, + "processor-family", + 0x01 /* Other */); save_opt(&type4.manufacturer, opts, "manufacturer"); save_opt(&type4.version, opts, "version"); save_opt(&type4.serial, opts, "serial"); diff --git a/hw/ssi/aspeed_smc.c b/hw/ssi/aspeed_smc.c index 2a4001b774a2f5c66a6c794763e99276b2bb1d94..8af919a970401566080a8567786faf2a821ef6e1 100644 --- a/hw/ssi/aspeed_smc.c +++ b/hw/ssi/aspeed_smc.c @@ -764,8 +764,7 @@ static uint8_t aspeed_smc_hclk_divisor(uint8_t hclk_mask) } } - aspeed_smc_error("invalid HCLK mask %x", hclk_mask); - return 0; + g_assert_not_reached(); } /* diff --git a/hw/timer/exynos4210_mct.c b/hw/timer/exynos4210_mct.c index 446bbd2b96cf3c65d8c15b1ddb13c39f2937065f..6f47bfe2c29b8d874e17a95756886dc21e5f23d6 100644 --- a/hw/timer/exynos4210_mct.c +++ b/hw/timer/exynos4210_mct.c @@ -815,7 +815,7 @@ static uint32_t exynos4210_ltick_cnt_get_cnto(struct tick_timer *s) /* Both are counting */ icnto = remain / s->tcntb; if (icnto) { - tcnto = remain % (icnto * s->tcntb); + tcnto = remain % ((uint64_t)icnto * s->tcntb); } else { tcnto = remain % s->tcntb; } diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c index eccdb852a0a7ecefe3b7cd8c5a2919572a326272..f57d33e77169fb197e3afbd58110fd28ea8fe3e5 100644 --- a/hw/ufs/ufs.c +++ b/hw/ufs/ufs.c @@ -25,6 +25,7 @@ #include "qapi/error.h" #include "migration/vmstate.h" #include "scsi/constants.h" +#include "hw/irq.h" #include "trace.h" #include "ufs.h" @@ -126,6 +127,10 @@ static MemTxResult ufs_dma_read_req_upiu(UfsRequest *req) copy_size = sizeof(UtpUpiuHeader) + 
UFS_TRANSACTION_SPECIFIC_FIELD_SIZE + data_segment_length; + if (copy_size > sizeof(req->req_upiu)) { + copy_size = sizeof(req->req_upiu); + } + ret = ufs_addr_read(u, req_upiu_base_addr, &req->req_upiu, copy_size); if (ret) { trace_ufs_err_dma_read_req_upiu(req->slot, req_upiu_base_addr); @@ -225,6 +230,10 @@ static MemTxResult ufs_dma_write_rsp_upiu(UfsRequest *req) copy_size = rsp_upiu_byte_len; } + if (copy_size > sizeof(req->rsp_upiu)) { + copy_size = sizeof(req->rsp_upiu); + } + ret = ufs_addr_write(u, rsp_upiu_base_addr, &req->rsp_upiu, copy_size); if (ret) { trace_ufs_err_dma_write_rsp_upiu(req->slot, rsp_upiu_base_addr); @@ -447,6 +456,14 @@ void ufs_build_upiu_header(UfsRequest *req, uint8_t trans_type, uint8_t flags, req->rsp_upiu.header.data_segment_length = cpu_to_be16(data_segment_length); } +void ufs_build_query_response(UfsRequest *req) +{ + req->rsp_upiu.qr.opcode = req->req_upiu.qr.opcode; + req->rsp_upiu.qr.idn = req->req_upiu.qr.idn; + req->rsp_upiu.qr.index = req->req_upiu.qr.index; + req->rsp_upiu.qr.selector = req->req_upiu.qr.selector; +} + static UfsReqResult ufs_exec_scsi_cmd(UfsRequest *req) { UfsHc *u = req->hc; @@ -923,10 +940,6 @@ static QueryRespCode ufs_read_desc(UfsRequest *req) if (length > req->rsp_upiu.qr.data[0]) { length = req->rsp_upiu.qr.data[0]; } - req->rsp_upiu.qr.opcode = req->req_upiu.qr.opcode; - req->rsp_upiu.qr.idn = req->req_upiu.qr.idn; - req->rsp_upiu.qr.index = req->req_upiu.qr.index; - req->rsp_upiu.qr.selector = req->req_upiu.qr.selector; req->rsp_upiu.qr.length = cpu_to_be16(length); return status; @@ -1007,6 +1020,7 @@ static UfsReqResult ufs_exec_query_cmd(UfsRequest *req) data_segment_length = be16_to_cpu(req->rsp_upiu.qr.length); ufs_build_upiu_header(req, UFS_UPIU_TRANSACTION_QUERY_RSP, 0, status, 0, data_segment_length); + ufs_build_query_response(req); if (status != UFS_QUERY_RESULT_SUCCESS) { return UFS_REQUEST_FAIL; @@ -1273,6 +1287,8 @@ static void ufs_exit(PCIDevice *pci_dev) { UfsHc *u = UFS(pci_dev); + qemu_free_irq(u->irq); + qemu_bh_delete(u->doorbell_bh); qemu_bh_delete(u->complete_bh); diff --git a/hw/ufs/ufs.h b/hw/ufs/ufs.h index 8fda94f4efa9e4d2d8a5b8041528195eb74f8378..8a74b4c2ab7280f8128fc0263902a24708c0a1e6 100644 --- a/hw/ufs/ufs.h +++ b/hw/ufs/ufs.h @@ -132,6 +132,7 @@ static inline bool is_wlun(uint8_t lun) void ufs_build_upiu_header(UfsRequest *req, uint8_t trans_type, uint8_t flags, uint8_t response, uint8_t scsi_status, uint16_t data_segment_length); +void ufs_build_query_response(UfsRequest *req); void ufs_complete_req(UfsRequest *req, UfsReqResult req_result); void ufs_init_wlu(UfsLu *wlu, uint8_t wlun); #endif /* HW_UFS_UFS_H */ diff --git a/hw/usb/bus.c b/hw/usb/bus.c index 92d6ed5626141f8622c55bedb9dc23326b3338c8..20cd9b6e6fd9e70689bfe756ff44a2ff8a013fca 100644 --- a/hw/usb/bus.c +++ b/hw/usb/bus.c @@ -536,6 +536,10 @@ void usb_check_attach(USBDevice *dev, Error **errp) bus->qbus.name, port->path, portspeed); return; } + + qemu_log("attach usb device \"%s\" (%s speed) to VM bus \"%s\", " + "port \"%s\" (%s speed)\n", dev->product_desc, devspeed, + bus->qbus.name, port->path, portspeed); } void usb_device_attach(USBDevice *dev, Error **errp) @@ -564,6 +568,8 @@ int usb_device_detach(USBDevice *dev) usb_detach(port); dev->attached = false; + qemu_log("detach usb device \"%s\" from VM bus \"%s\", port \"%s\"\n", + dev->product_desc, bus->qbus.name, port->path); return 0; } diff --git a/hw/usb/core.c b/hw/usb/core.c index 975f76250a1a34b79252975b13724368577240f4..51b36126cab25b5c9854f157b09ce14c233b2e51 
100644 --- a/hw/usb/core.c +++ b/hw/usb/core.c @@ -87,7 +87,7 @@ void usb_device_reset(USBDevice *dev) return; } usb_device_handle_reset(dev); - dev->remote_wakeup = 0; + dev->remote_wakeup &= ~USB_DEVICE_REMOTE_WAKEUP; dev->addr = 0; dev->state = USB_STATE_DEFAULT; } @@ -105,7 +105,8 @@ void usb_wakeup(USBEndpoint *ep, unsigned int stream) */ return; } - if (dev->remote_wakeup && dev->port && dev->port->ops->wakeup) { + if ((dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) + && dev->port && dev->port->ops->wakeup) { dev->port->ops->wakeup(dev->port); } if (bus->ops->wakeup_endpoint) { diff --git a/hw/usb/desc.c b/hw/usb/desc.c index f2bdc05a95394c6c5f942dd2aad67dc124ede069..333f73fff150f0a5fd1d015eac1bb1943286f6a8 100644 --- a/hw/usb/desc.c +++ b/hw/usb/desc.c @@ -752,7 +752,7 @@ int usb_desc_handle_control(USBDevice *dev, USBPacket *p, if (config->bmAttributes & USB_CFG_ATT_SELFPOWER) { data[0] |= 1 << USB_DEVICE_SELF_POWERED; } - if (dev->remote_wakeup) { + if (dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) { data[0] |= 1 << USB_DEVICE_REMOTE_WAKEUP; } data[1] = 0x00; @@ -762,14 +762,15 @@ int usb_desc_handle_control(USBDevice *dev, USBPacket *p, } case DeviceOutRequest | USB_REQ_CLEAR_FEATURE: if (value == USB_DEVICE_REMOTE_WAKEUP) { - dev->remote_wakeup = 0; + dev->remote_wakeup &= ~USB_DEVICE_REMOTE_WAKEUP; ret = 0; } trace_usb_clear_device_feature(dev->addr, value, ret); break; case DeviceOutRequest | USB_REQ_SET_FEATURE: + dev->remote_wakeup |= USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED; if (value == USB_DEVICE_REMOTE_WAKEUP) { - dev->remote_wakeup = 1; + dev->remote_wakeup |= USB_DEVICE_REMOTE_WAKEUP; ret = 0; } trace_usb_set_device_feature(dev->addr, value, ret); diff --git a/hw/usb/dev-hid.c b/hw/usb/dev-hid.c index bdd6d1ffafe432b75a5321305c5d0193e3e27bb1..cc68d1ce9eec463610429d5802ae5c23a6b74a5d 100644 --- a/hw/usb/dev-hid.c +++ b/hw/usb/dev-hid.c @@ -745,7 +745,7 @@ static int usb_ptr_post_load(void *opaque, int version_id) { USBHIDState *s = opaque; - if (s->dev.remote_wakeup) { + if (s->dev.remote_wakeup & USB_DEVICE_REMOTE_WAKEUP) { hid_pointer_activate(&s->hid); } return 0; diff --git a/hw/usb/dev-hub.c b/hw/usb/dev-hub.c index 5703e0e826ec6484c5b7e92b248656fd382414eb..7b3cfa2c1b5de9de73230c9348f98f6a3cd1f8a6 100644 --- a/hw/usb/dev-hub.c +++ b/hw/usb/dev-hub.c @@ -479,6 +479,7 @@ static void usb_hub_handle_control(USBDevice *dev, USBPacket *p, usb_hub_port_clear(port, PORT_STAT_SUSPEND); port->wPortChange = 0; } + break; default: goto fail; } diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c index 19b4534c20c10d78d5e3d1fdca7aefc168e9265a..fa8c7af5c81d00cc783fa9d8b9c66df1efb688a2 100644 --- a/hw/usb/hcd-ehci.c +++ b/hw/usb/hcd-ehci.c @@ -1086,8 +1086,9 @@ static void ehci_opreg_write(void *ptr, hwaddr addr, case CONFIGFLAG: val &= 0x1; if (val) { - for(i = 0; i < NB_PORTS; i++) + for (i = 0; i < NB_PORTS; i++) { handle_port_owner_write(s, i, 0); + } } break; @@ -2286,7 +2287,8 @@ static void ehci_work_bh(void *opaque) ehci_update_frindex(ehci, skipped_uframes); ehci->last_run_ns += UFRAME_TIMER_NS * skipped_uframes; uframes -= skipped_uframes; - DPRINTF("WARNING - EHCI skipped %d uframes\n", skipped_uframes); + DPRINTF("WARNING - EHCI skipped %"PRIu64" uframes\n", + skipped_uframes); } for (i = 0; i < uframes; i++) { diff --git a/hw/usb/hcd-uhci.c b/hw/usb/hcd-uhci.c index 77baaa7a6b1099927c420decfc956e6f74429349..a92581ff5f883632d7a167a8e183ba3d1d5943d4 100644 --- a/hw/usb/hcd-uhci.c +++ b/hw/usb/hcd-uhci.c @@ -44,6 +44,8 @@ #include "hcd-uhci.h" #define FRAME_TIMER_FREQ 1000 
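The UHCI changes beginning here replace the fixed FRAME_TIMER_FREQ period with one derived at runtime (NANOSECONDS_PER_SECOND / uhci_frame_timer_freq), letting the controller idle at a lazy 10 Hz and jump back to the nominal 1000 Hz when an attached device needs it. Whenever the frequency flips, the pending deadline is rebased by subtracting the old period and adding the new one, as the uhci_frame_timer() hunk below does. A small stand-alone sketch of that arithmetic (names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define NS_PER_SECOND 1000000000LL

/* Rebase a pending deadline after a tick-frequency change so the next
 * tick lands one new-period after the frame that was just processed. */
static int64_t rebase_deadline(int64_t expire_ns, int64_t old_freq_hz,
                               int64_t new_freq_hz)
{
    int64_t old_period = NS_PER_SECOND / old_freq_hz;
    int64_t new_period = NS_PER_SECOND / new_freq_hz;

    return expire_ns - old_period + new_period;
}

int main(void)
{
    /* switching from the lazy 10 Hz rate to the nominal 1000 Hz rate */
    int64_t expire = 5 * NS_PER_SECOND + NS_PER_SECOND / 10;

    printf("rebased deadline: %lld ns\n",
           (long long)rebase_deadline(expire, 10, 1000));
    return 0;
}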
+#define FRAME_TIMER_FREQ_LAZY 10 +#define USB_DEVICE_NEED_NORMAL_FREQ "QEMU USB Tablet" #define FRAME_MAX_LOOPS 256 @@ -109,6 +111,22 @@ static void uhci_async_cancel(UHCIAsync *async); static void uhci_queue_fill(UHCIQueue *q, UHCI_TD *td); static void uhci_resume(void *opaque); +static int64_t uhci_frame_timer_freq = FRAME_TIMER_FREQ_LAZY; + +static void uhci_set_frame_freq(int freq) +{ + if (freq <= 0) { + return; + } + + uhci_frame_timer_freq = freq; +} + +static qemu_usb_controller qemu_uhci = { + .name = "uhci", + .qemu_set_freq = uhci_set_frame_freq, +}; + static inline int32_t uhci_queue_token(UHCI_TD *td) { if ((td->token & (0xf << 15)) == 0) { @@ -351,7 +369,7 @@ static int uhci_post_load(void *opaque, int version_id) if (version_id < 2) { s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - (NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ); + (NANOSECONDS_PER_SECOND / uhci_frame_timer_freq); } return 0; } @@ -392,8 +410,29 @@ static void uhci_port_write(void *opaque, hwaddr addr, if ((val & UHCI_CMD_RS) && !(s->cmd & UHCI_CMD_RS)) { /* start frame processing */ trace_usb_uhci_schedule_start(); - s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - (NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ); + + /* + * If the frame_timer frequency is too low, a guest OS (e.g. Win2012) can + * blue-screen after some vCPUs are hot-plugged. + * If the USB device supports remote wakeup, the UHCI controller enters + * global suspend when there is no input for several seconds, and QEMU + * then deletes the frame_timer; with the timer gone there is no + * performance cost to the VM, so the frequency can safely be raised to + * 1000 here and takes effect the next time the frame_timer is armed. + * Apart from this path, there are two other ways the frequency changes: + * 1) VNC connect/disconnect; 2) USB device attach/detach.
+ */ + if ((uhci_frame_timer_freq != FRAME_TIMER_FREQ) + && (s->ports[0].port.dev) + && (!memcmp(s->ports[0].port.dev->product_desc, + USB_DEVICE_NEED_NORMAL_FREQ, strlen(USB_DEVICE_NEED_NORMAL_FREQ))) + && (s->ports[0].port.dev->remote_wakeup & USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED)) { + qemu_log("turn up the frequency of the UHCI controller to %d\n", FRAME_TIMER_FREQ); + uhci_frame_timer_freq = FRAME_TIMER_FREQ; + } + + s->frame_time = NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ; + s->expire_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->frame_time; timer_mod(s->frame_timer, s->expire_time); s->status &= ~UHCI_STS_HCHALTED; } else if (!(val & UHCI_CMD_RS)) { @@ -457,8 +496,9 @@ static void uhci_port_write(void *opaque, hwaddr addr, int n; n = (addr >> 1) & 7; - if (n >= NB_PORTS) + if (n >= NB_PORTS) { return; + } port = &s->ports[n]; dev = port->port.dev; if (dev && dev->attached) { @@ -513,8 +553,9 @@ static uint64_t uhci_port_read(void *opaque, hwaddr addr, unsigned size) UHCIPort *port; int n; n = (addr >> 1) & 7; - if (n >= NB_PORTS) + if (n >= NB_PORTS) { goto read_default; + } port = &s->ports[n]; val = port->ctrl; } @@ -1081,7 +1122,6 @@ static void uhci_frame_timer(void *opaque) UHCIState *s = opaque; uint64_t t_now, t_last_run; int i, frames; - const uint64_t frame_t = NANOSECONDS_PER_SECOND / FRAME_TIMER_FREQ; s->completions_only = false; qemu_bh_cancel(s->bh); @@ -1097,14 +1137,14 @@ static void uhci_frame_timer(void *opaque) } /* We still store expire_time in our state, for migration */ - t_last_run = s->expire_time - frame_t; + t_last_run = s->expire_time - s->frame_time; t_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); /* Process up to MAX_FRAMES_PER_TICK frames */ - frames = (t_now - t_last_run) / frame_t; + frames = (t_now - t_last_run) / s->frame_time; if (frames > s->maxframes) { int skipped = frames - s->maxframes; - s->expire_time += skipped * frame_t; + s->expire_time += skipped * s->frame_time; s->frnum = (s->frnum + skipped) & 0x7ff; frames -= skipped; } @@ -1121,7 +1161,7 @@ /* The spec says frnum is the frame currently being processed, and * the guest must look at frnum - 1 on interrupt, so inc frnum now */ s->frnum = (s->frnum + 1) & 0x7ff; - s->expire_time += frame_t; + s->expire_time += s->frame_time; } /* Complete the previous frame(s) */ @@ -1132,7 +1172,12 @@ } s->pending_int_mask = 0; - timer_mod(s->frame_timer, t_now + frame_t); + /* expire_time was computed from the previous frame_time; rebase it onto + * the new frame_time, which equals + * NANOSECONDS_PER_SECOND / uhci_frame_timer_freq */ + s->expire_time -= s->frame_time - NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; + s->frame_time = NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; + timer_mod(s->frame_timer, t_now + s->frame_time); } static const MemoryRegionOps uhci_ioport_ops = { @@ -1193,8 +1238,10 @@ void usb_uhci_common_realize(PCIDevice *dev, Error **errp) s->bh = qemu_bh_new_guarded(uhci_bh, s, &DEVICE(dev)->mem_reentrancy_guard); s->frame_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, uhci_frame_timer, s); s->num_ports_vmstate = NB_PORTS; + s->frame_time = NANOSECONDS_PER_SECOND / uhci_frame_timer_freq; QTAILQ_INIT(&s->queues); + qemu_register_usb_controller(&qemu_uhci, QEMU_USB_CONTROLLER_UHCI); memory_region_init_io(&s->io_bar, OBJECT(s), &uhci_ioport_ops, s, "uhci", 0x20); diff --git a/hw/usb/hcd-uhci.h index
69f8b40c49c2febf063a0f3ee3e5fd677e3f7a9a..091871991133c3299db55e05089175c1dc7cecdc 100644 --- a/hw/usb/hcd-uhci.h +++ b/hw/usb/hcd-uhci.h @@ -50,6 +50,7 @@ typedef struct UHCIState { uint16_t status; uint16_t intr; /* interrupt enable register */ uint16_t frnum; /* frame number */ + uint64_t frame_time; /* frame time in ns */ uint32_t fl_base_addr; /* frame list base address */ uint8_t sof_timing; uint8_t status2; /* bit 0 and 1 are used to generate UHCI_STS_USBINT */ diff --git a/hw/usb/host-libusb.c b/hw/usb/host-libusb.c index d7060a42d57e072303e509fde25eae91318847e6..11a246ac7223db529cda44320f6118f62c9048b9 100644 --- a/hw/usb/host-libusb.c +++ b/hw/usb/host-libusb.c @@ -945,6 +945,30 @@ static void usb_host_ep_update(USBHostDevice *s) libusb_free_config_descriptor(conf); } +static unsigned int usb_get_controller_type(int speed) +{ + unsigned int type = MAX_USB_CONTROLLER_TYPES; + + switch (speed) { + case USB_SPEED_SUPER: + type = QEMU_USB_CONTROLLER_XHCI; + break; + case USB_SPEED_HIGH: + type = QEMU_USB_CONTROLLER_EHCI; + break; + case USB_SPEED_FULL: + type = QEMU_USB_CONTROLLER_UHCI; + break; + case USB_SPEED_LOW: + type = QEMU_USB_CONTROLLER_OHCI; + break; + default: + break; + } + + return type; +} + static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) { USBDevice *udev = USB_DEVICE(s); @@ -968,6 +992,8 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) rc = libusb_open(dev, &s->dh); if (rc != 0) { + qemu_log("libusb open usb device bus %d, device %d failed\n", + bus_num, addr); goto fail; } } else { @@ -995,6 +1021,7 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) libusb_get_device_descriptor(dev, &s->ddesc); usb_host_get_port(s->dev, s->port, sizeof(s->port)); + qemu_log("open a host usb device on bus %d, device %d\n", bus_num, addr); usb_ep_init(udev); usb_host_ep_update(s); @@ -1054,6 +1081,12 @@ static int usb_host_open(USBHostDevice *s, libusb_device *dev, int hostfd) } trace_usb_host_open_success(bus_num, addr); + + /* change ehci frame time freq when USB passthrough */ + qemu_log("usb host speed is %d\n", udev->speed); + qemu_timer_set_mode(QEMU_TIMER_USB_NORMAL_MODE, + usb_get_controller_type(udev->speed)); + return 0; fail: @@ -1116,6 +1149,8 @@ static int usb_host_close(USBHostDevice *s) usb_device_detach(udev); } + qemu_log("begin to reset the usb device, bus : %d, device : %d\n", + s->bus_num, s->addr); usb_host_release_interfaces(s); libusb_reset_device(s->dh); usb_host_attach_kernel(s); @@ -1129,6 +1164,8 @@ static int usb_host_close(USBHostDevice *s) } usb_host_auto_check(NULL); + qemu_timer_set_mode(QEMU_TIMER_USB_LAZY_MODE, + usb_get_controller_type(udev->speed)); return 0; } diff --git a/hw/vfio/Kconfig b/hw/vfio/Kconfig index 7cdba0560aa821c88d3420b36f86020575834202..5f0d3c2d2bfae4acbc76b91bca73ec02d0184e55 100644 --- a/hw/vfio/Kconfig +++ b/hw/vfio/Kconfig @@ -41,3 +41,9 @@ config VFIO_IGD bool default y if PC_PCI depends on VFIO_PCI + +config VFIO_HCT + bool + default y + select VFIO + depends on LINUX && PCI diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index bbf69ff55ae8a922d0ab6d6f966a9a2283cdd2a3..6b2bc325493fbb3a33fc48a11b3869686f8dfaaf 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -11,10 +11,12 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include #include "qapi/error.h" #include "hw/vfio/vfio-common.h" +#include "sysemu/iommufd.h" #include "hw/s390x/ap-device.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" @@ -158,18 
+160,9 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); VFIODevice *vbasedev = &vapdev->vdev; - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - vbasedev->ops = &vfio_ap_ops; - vbasedev->type = VFIO_DEVICE_TYPE_AP; - vbasedev->dev = dev; - - /* - * vfio-ap devices operate in a way compatible with discarding of - * memory in RAM blocks, as no pages are pinned in the host. - * This needs to be set before vfio_get_device() for vfio common to - * handle ram_block_discard_disable(). - */ - vapdev->vdev.ram_block_discard_allowed = true; + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } ret = vfio_attach_device(vbasedev->name, vbasedev, &address_space_memory, errp); @@ -204,6 +197,10 @@ static void vfio_ap_unrealize(DeviceState *dev) static Property vfio_ap_properties[] = { DEFINE_PROP_STRING("sysfsdev", VFIOAPDevice, vdev.sysfsdev), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOAPDevice, vdev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; @@ -224,11 +221,39 @@ static const VMStateDescription vfio_ap_vmstate = { .unmigratable = 1, }; +static void vfio_ap_instance_init(Object *obj) +{ + VFIOAPDevice *vapdev = VFIO_AP_DEVICE(obj); + VFIODevice *vbasedev = &vapdev->vdev; + + /* + * vfio-ap devices operate in a way compatible with discarding of + * memory in RAM blocks, as no pages are pinned in the host. + * This needs to be set before vfio_get_device() for vfio common to + * handle ram_block_discard_disable(). + */ + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, + DEVICE(vapdev), true); + + /* AP device is mdev type device */ + vbasedev->mdev = true; +} + +#ifdef CONFIG_IOMMUFD +static void vfio_ap_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_AP_DEVICE(obj)->vdev, str, errp); +} +#endif + static void vfio_ap_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); device_class_set_props(dc, vfio_ap_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_ap_set_fd); +#endif dc->vmsd = &vfio_ap_vmstate; dc->desc = "VFIO-based AP device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); @@ -243,6 +268,7 @@ static const TypeInfo vfio_ap_info = { .name = TYPE_VFIO_AP_DEVICE, .parent = TYPE_AP_DEVICE, .instance_size = sizeof(VFIOAPDevice), + .instance_init = vfio_ap_instance_init, .class_init = vfio_ap_class_init, }; diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index d857bb8d0fe4ff71b3cf635d6da43eccbe593d5f..257e9723cf58ab852d039178c43b4f2456765aeb 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -15,12 +15,14 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include #include #include #include "qapi/error.h" #include "hw/vfio/vfio-common.h" +#include "sysemu/iommufd.h" #include "hw/s390x/s390-ccw.h" #include "hw/s390x/vfio-ccw.h" #include "hw/qdev-properties.h" @@ -588,22 +590,9 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) } } - vbasedev->ops = &vfio_ccw_ops; - vbasedev->type = VFIO_DEVICE_TYPE_CCW; - vbasedev->name = g_strdup_printf("%x.%x.%04x", vcdev->cdev.hostid.cssid, - vcdev->cdev.hostid.ssid, - vcdev->cdev.hostid.devid); - vbasedev->dev = dev; - - /* - * All vfio-ccw devices are believed to operate in a way compatible with - * discarding of memory in RAM blocks, ie. 
pages pinned in the host are - * in the current working set of the guest driver and therefore never - * overlap e.g., with pages available to the guest balloon driver. This - * needs to be set before vfio_get_device() for vfio common to handle - * ram_block_discard_disable(). - */ - vbasedev->ram_block_discard_allowed = true; + if (vfio_device_get_name(vbasedev, errp) < 0) { + return; + } ret = vfio_attach_device(cdev->mdevid, vbasedev, &address_space_memory, errp); @@ -677,6 +666,10 @@ static void vfio_ccw_unrealize(DeviceState *dev) static Property vfio_ccw_properties[] = { DEFINE_PROP_STRING("sysfsdev", VFIOCCWDevice, vdev.sysfsdev), DEFINE_PROP_BOOL("force-orb-pfch", VFIOCCWDevice, force_orb_pfch, false), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOCCWDevice, vdev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; @@ -685,12 +678,42 @@ static const VMStateDescription vfio_ccw_vmstate = { .unmigratable = 1, }; +static void vfio_ccw_instance_init(Object *obj) +{ + VFIOCCWDevice *vcdev = VFIO_CCW(obj); + VFIODevice *vbasedev = &vcdev->vdev; + + /* CCW device is mdev type device */ + vbasedev->mdev = true; + + /* + * All vfio-ccw devices are believed to operate in a way compatible with + * discarding of memory in RAM blocks, ie. pages pinned in the host are + * in the current working set of the guest driver and therefore never + * overlap e.g., with pages available to the guest balloon driver. This + * needs to be set before vfio_get_device() for vfio common to handle + * ram_block_discard_disable(). + */ + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops, + DEVICE(vcdev), true); +} + +#ifdef CONFIG_IOMMUFD +static void vfio_ccw_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_CCW(obj)->vdev, str, errp); +} +#endif + static void vfio_ccw_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); S390CCWDeviceClass *cdc = S390_CCW_DEVICE_CLASS(klass); device_class_set_props(dc, vfio_ccw_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_ccw_set_fd); +#endif dc->vmsd = &vfio_ccw_vmstate; dc->desc = "VFIO-based subchannel assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); @@ -708,6 +731,7 @@ static const TypeInfo vfio_ccw_info = { .name = TYPE_VFIO_CCW, .parent = TYPE_S390_CCW, .instance_size = sizeof(VFIOCCWDevice), + .instance_init = vfio_ccw_instance_init, .class_init = vfio_ccw_class_init, }; diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e70fdf5e0cacf756fb8dcce2bfd906c65ae71684..0be63c5fbc574b040ed797cb6411243807f1e537 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -145,7 +145,7 @@ void vfio_unblock_multiple_devices_migration(void) bool vfio_viommu_preset(VFIODevice *vbasedev) { - return vbasedev->container->space->as != &address_space_memory; + return vbasedev->bcontainer->space->as != &address_space_memory; } static void vfio_set_migration_error(int err) @@ -177,7 +177,7 @@ bool vfio_device_state_is_precopy(VFIODevice *vbasedev) migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P; } -static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) +static bool vfio_devices_all_dirty_tracking(VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; MigrationState *ms = migrate_get_current(); @@ -187,7 +187,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return false; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, 
&bcontainer->device_list, container_next) { VFIOMigration *migration = vbasedev->migration; if (!migration) { @@ -203,11 +203,14 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) return true; } -bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) +bool vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) { + return false; + } if (!vbasedev->dirty_pages_supported) { return false; } @@ -220,7 +223,8 @@ bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container) * Check if all VFIO devices are running and migration is active, which is * essentially equivalent to the migration being in pre-copy phase. */ -bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) +bool +vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer) { VFIODevice *vbasedev; @@ -228,7 +232,7 @@ bool vfio_devices_all_running_and_mig_active(VFIOContainer *container) return false; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { VFIOMigration *migration = vbasedev->migration; if (!migration) { @@ -292,7 +296,7 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); - VFIOContainer *container = giommu->container; + VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; void *vaddr; int ret; @@ -322,21 +326,22 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) * of vaddr will always be there, even if the memory object is * destroyed and its backing memory munmap-ed. */ - ret = vfio_dma_map(container, iova, - iotlb->addr_mask + 1, vaddr, - read_only); + ret = vfio_container_dma_map(bcontainer, iova, + iotlb->addr_mask + 1, vaddr, + read_only); if (ret) { - error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", - container, iova, + bcontainer, iova, iotlb->addr_mask + 1, vaddr, ret, strerror(-ret)); } } else { - ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); + ret = vfio_container_dma_unmap(bcontainer, iova, + iotlb->addr_mask + 1, iotlb); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, + bcontainer, iova, iotlb->addr_mask + 1, ret, strerror(-ret)); vfio_set_migration_error(ret); } @@ -350,14 +355,15 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; const hwaddr size = int128_get64(section->size); const hwaddr iova = section->offset_within_address_space; int ret; /* Unmap with a single call. 
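 * For example, a 1 GiB virtio-mem section is torn down here by a single vfio_container_dma_unmap(bcontainer, iova, size, NULL) spanning the whole section, rather than one unmap per discarded block.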
*/ - ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); if (ret) { - error_report("%s: vfio_dma_unmap() failed: %s", __func__, + error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); } } @@ -367,6 +373,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, { VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, listener); + VFIOContainerBase *bcontainer = vrdl->bcontainer; const hwaddr end = section->offset_within_region + int128_get64(section->size); hwaddr start, next, iova; @@ -385,8 +392,8 @@ section->offset_within_address_space; vaddr = memory_region_get_ram_ptr(section->mr) + start; - ret = vfio_dma_map(vrdl->container, iova, next - start, - vaddr, section->readonly); + ret = vfio_container_dma_map(bcontainer, iova, next - start, + vaddr, section->readonly); if (ret) { /* Rollback */ vfio_ram_discard_notify_discard(rdl, section); @@ -396,7 +403,7 @@ return 0; } -static void vfio_register_ram_discard_listener(VFIOContainer *container, +static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); @@ -409,7 +416,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); vrdl = g_new0(VFIORamDiscardListener, 1); - vrdl->container = container; + vrdl->bcontainer = bcontainer; vrdl->mr = section->mr; vrdl->offset_within_address_space = section->offset_within_address_space; vrdl->size = int128_get64(section->size); @@ -417,14 +424,14 @@ section->mr); g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); - g_assert(container->pgsizes && - vrdl->granularity >= 1ULL << ctz64(container->pgsizes)); + g_assert(bcontainer->pgsizes && + vrdl->granularity >= 1ULL << ctz64(bcontainer->pgsizes)); ram_discard_listener_init(&vrdl->listener, vfio_ram_discard_notify_populate, vfio_ram_discard_notify_discard, true); ram_discard_manager_register_listener(rdm, &vrdl->listener, section); - QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); + QLIST_INSERT_HEAD(&bcontainer->vrdl_list, vrdl, next); /* * Sanity-check if we have a theoretically problematic setup where we could @@ -439,7 +446,7 @@ static void vfio_register_ram_discard_listener(VFIOContainer *container, * number of sections in the address space we could have over time, * also consuming DMA mappings. */ - if (container->dma_max_mappings) { + if (bcontainer->dma_max_mappings) { unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; #ifdef CONFIG_KVM @@ -448,7 +455,7 @@ } #endif - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { hwaddr start, end; start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, @@ -460,23 +467,23 @@ } if (vrdl_mappings + max_memslots - vrdl_count > - container->dma_max_mappings) { + bcontainer->dma_max_mappings) { warn_report("%s: possibly running out of DMA mappings. E.g., try" " increasing the 'block-size' of virtio-mem devices."
" Maximum possible DMA mappings: %d, Maximum possible" - " memslots: %d", __func__, container->dma_max_mappings, + " memslots: %d", __func__, bcontainer->dma_max_mappings, max_memslots); } } } -static void vfio_unregister_ram_discard_listener(VFIOContainer *container, +static void vfio_unregister_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl = NULL; - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { if (vrdl->mr == section->mr && vrdl->offset_within_address_space == section->offset_within_address_space) { @@ -538,7 +545,7 @@ static bool vfio_listener_valid_section(MemoryRegionSection *section, return true; } -static bool vfio_get_section_iova_range(VFIOContainer *container, +static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, MemoryRegionSection *section, hwaddr *out_iova, hwaddr *out_end, Int128 *out_llend) @@ -566,7 +573,8 @@ static bool vfio_get_section_iova_range(VFIOContainer *container, static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -577,7 +585,8 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { if (memory_region_is_ram_device(section->mr)) { trace_vfio_listener_region_add_no_dma_map( memory_region_name(section->mr), @@ -588,7 +597,7 @@ static void vfio_listener_region_add(MemoryListener *listener, return; } - if (vfio_container_add_section_window(container, section, &err)) { + if (vfio_container_add_section_window(bcontainer, section, &err)) { goto fail; } @@ -610,7 +619,7 @@ static void vfio_listener_region_add(MemoryListener *listener, giommu->iommu_mr = iommu_mr; giommu->iommu_offset = section->offset_within_address_space - section->offset_within_region; - giommu->container = container; + giommu->bcontainer = bcontainer; llend = int128_add(int128_make64(section->offset_within_region), section->size); llend = int128_sub(llend, int128_one()); @@ -623,16 +632,17 @@ static void vfio_listener_region_add(MemoryListener *listener, iommu_idx); ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr, - container->pgsizes, + bcontainer->pgsizes, &err); if (ret) { g_free(giommu); goto fail; } - if (container->iova_ranges) { + if (bcontainer->iova_ranges) { ret = memory_region_iommu_set_iova_ranges(giommu->iommu_mr, - container->iova_ranges, &err); + bcontainer->iova_ranges, + &err); if (ret) { g_free(giommu); goto fail; @@ -645,7 +655,7 @@ static void vfio_listener_region_add(MemoryListener *listener, g_free(giommu); goto fail; } - QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + QLIST_INSERT_HEAD(&bcontainer->giommu_list, giommu, giommu_next); memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); return; @@ -659,7 +669,7 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. 
*/ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_register_ram_discard_listener(container, section); + vfio_register_ram_discard_listener(bcontainer, section); return; } @@ -672,7 +682,7 @@ static void vfio_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (memory_region_is_ram_device(section->mr)) { - hwaddr pgmask = (1ULL << ctz64(container->pgsizes)) - 1; + hwaddr pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) { trace_vfio_listener_region_add_no_dma_map( @@ -684,12 +694,12 @@ static void vfio_listener_region_add(MemoryListener *listener, } } - ret = vfio_dma_map(container, iova, int128_get64(llsize), - vaddr, section->readonly); + ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize), + vaddr, section->readonly); if (ret) { - error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx", %p) = %d (%s)", - container, iova, int128_get64(llsize), vaddr, ret, + bcontainer, iova, int128_get64(llsize), vaddr, ret, strerror(-ret)); if (memory_region_is_ram_device(section->mr)) { /* Allow unexpected mappings not to be fatal for RAM devices */ @@ -711,9 +721,9 @@ fail: * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. */ - if (!container->initialized) { - if (!container->error) { - error_propagate_prepend(&container->error, err, + if (!bcontainer->initialized) { + if (!bcontainer->error) { + error_propagate_prepend(&bcontainer->error, err, "Region %s: ", memory_region_name(section->mr)); } else { @@ -728,7 +738,8 @@ fail: static void vfio_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); hwaddr iova, end; Int128 llend, llsize; int ret; @@ -741,7 +752,7 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { memory_region_unregister_iommu_notifier(section->mr, @@ -761,7 +772,8 @@ static void vfio_listener_region_del(MemoryListener *listener, */ } - if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) { + if (!vfio_get_section_iova_range(bcontainer, section, &iova, &end, + &llend)) { return; } @@ -772,10 +784,10 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_ram_device(section->mr)) { hwaddr pgmask; - pgmask = (1ULL << ctz64(container->pgsizes)) - 1; + pgmask = (1ULL << ctz64(bcontainer->pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); } else if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_unregister_ram_discard_listener(container, section); + vfio_unregister_ram_discard_listener(bcontainer, section); /* Unregistering will trigger an unmap. */ try_unmap = false; } @@ -784,27 +796,29 @@ static void vfio_listener_region_del(MemoryListener *listener, if (int128_eq(llsize, int128_2_64())) { /* The unmap ioctl doesn't accept a full 64-bit span. 
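 * For example, when llsize is exactly 2^64 it is split below into two 2^63-byte halves, each removed with its own vfio_container_dma_unmap() call.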
*/ llsize = int128_rshift(llsize, 1); - ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, int128_get64(llsize), ret, + bcontainer, iova, int128_get64(llsize), ret, strerror(-ret)); } iova += int128_get64(llsize); } - ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, + int128_get64(llsize), NULL); if (ret) { - error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, int128_get64(llsize), ret, + bcontainer, iova, int128_get64(llsize), ret, strerror(-ret)); } } memory_region_unref(section->mr); - vfio_container_del_section_window(container, section); + vfio_container_del_section_window(bcontainer, section); } typedef struct VFIODirtyRanges { @@ -817,13 +831,13 @@ typedef struct VFIODirtyRanges { } VFIODirtyRanges; typedef struct VFIODirtyRangesListener { - VFIOContainer *container; + VFIOContainerBase *bcontainer; VFIODirtyRanges ranges; MemoryListener listener; } VFIODirtyRangesListener; static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, - VFIOContainer *container) + VFIOContainerBase *bcontainer) { VFIOPCIDevice *pcidev; VFIODevice *vbasedev; @@ -831,7 +845,7 @@ static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, owner = memory_region_owner(section->mr); - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { continue; } @@ -854,7 +868,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, hwaddr iova, end, *min, *max; if (!vfio_listener_valid_section(section, "tracking_update") || - !vfio_get_section_iova_range(dirty->container, section, + !vfio_get_section_iova_range(dirty->bcontainer, section, &iova, &end, NULL)) { return; } @@ -878,7 +892,7 @@ static void vfio_dirty_tracking_update(MemoryListener *listener, * The alternative would be an IOVATree but that has a much bigger runtime * overhead and unnecessary complexity. 
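 * For instance (addresses assumed): a vfio-pci region mapped at an IOVA at or above 4 GiB is steered into minpci64/maxpci64 by the check below, while ordinary RAM sections land in the 32-bit or 64-bit bucket (chosen, in the unchanged remainder of this function, by whether the range end fits below 4 GiB), so a high PCI hole does not stretch the plain 64-bit range.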
*/ - if (vfio_section_is_vfio_pci(section, dirty->container) && + if (vfio_section_is_vfio_pci(section, dirty->bcontainer) && iova >= UINT32_MAX) { min = &range->minpci64; max = &range->maxpci64; @@ -902,7 +916,7 @@ static const MemoryListener vfio_dirty_tracking_listener = { .region_add = vfio_dirty_tracking_update, }; -static void vfio_dirty_tracking_init(VFIOContainer *container, +static void vfio_dirty_tracking_init(VFIOContainerBase *bcontainer, VFIODirtyRanges *ranges) { VFIODirtyRangesListener dirty; @@ -912,10 +926,10 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, dirty.ranges.min64 = UINT64_MAX; dirty.ranges.minpci64 = UINT64_MAX; dirty.listener = vfio_dirty_tracking_listener; - dirty.container = container; + dirty.bcontainer = bcontainer; memory_listener_register(&dirty.listener, - container->space->as); + bcontainer->space->as); *ranges = dirty.ranges; @@ -927,7 +941,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container, memory_listener_unregister(&dirty.listener); } -static void vfio_devices_dma_logging_stop(VFIOContainer *container) +static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) { uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature), sizeof(uint64_t))] = {}; @@ -938,7 +952,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) feature->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (!vbasedev->dirty_tracking) { continue; } @@ -952,7 +966,7 @@ static void vfio_devices_dma_logging_stop(VFIOContainer *container) } static struct vfio_device_feature * -vfio_device_feature_dma_logging_start_create(VFIOContainer *container, +vfio_device_feature_dma_logging_start_create(VFIOContainerBase *bcontainer, VFIODirtyRanges *tracking) { struct vfio_device_feature *feature; @@ -1025,21 +1039,21 @@ static void vfio_device_feature_dma_logging_start_destroy( g_free(feature); } -static int vfio_devices_dma_logging_start(VFIOContainer *container) +static int vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer) { struct vfio_device_feature *feature; VFIODirtyRanges ranges; VFIODevice *vbasedev; int ret = 0; - vfio_dirty_tracking_init(container, &ranges); - feature = vfio_device_feature_dma_logging_start_create(container, + vfio_dirty_tracking_init(bcontainer, &ranges); + feature = vfio_device_feature_dma_logging_start_create(bcontainer, &ranges); if (!feature) { return -errno; } - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { if (vbasedev->dirty_tracking) { continue; } @@ -1056,7 +1070,7 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container) out: if (ret) { - vfio_devices_dma_logging_stop(container); + vfio_devices_dma_logging_stop(bcontainer); } vfio_device_feature_dma_logging_start_destroy(feature); @@ -1066,13 +1080,14 @@ out: static void vfio_listener_log_global_start(MemoryListener *listener) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret; - if (vfio_devices_all_device_dirty_tracking(container)) { - ret = vfio_devices_dma_logging_start(container); + if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + ret = vfio_devices_dma_logging_start(bcontainer); } else { - ret = 
vfio_set_dirty_page_tracking(container, true); + ret = vfio_container_set_dirty_page_tracking(bcontainer, true); } if (ret) { @@ -1084,13 +1099,14 @@ static void vfio_listener_log_global_start(MemoryListener *listener) static void vfio_listener_log_global_stop(MemoryListener *listener) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret = 0; - if (vfio_devices_all_device_dirty_tracking(container)) { - vfio_devices_dma_logging_stop(container); + if (vfio_devices_all_device_dirty_tracking(bcontainer)) { + vfio_devices_dma_logging_stop(bcontainer); } else { - ret = vfio_set_dirty_page_tracking(container, false); + ret = vfio_container_set_dirty_page_tracking(bcontainer, false); } if (ret) { @@ -1126,14 +1142,14 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, return 0; } -int vfio_devices_query_dirty_bitmap(VFIOContainer *container, +int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size) { VFIODevice *vbasedev; int ret; - QLIST_FOREACH(vbasedev, &container->device_list, container_next) { + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { ret = vfio_device_dma_logging_report(vbasedev, iova, size, vbmap->bitmap); if (ret) { @@ -1149,31 +1165,40 @@ int vfio_devices_query_dirty_bitmap(VFIOContainer *container, return 0; } -int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, +int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova, uint64_t size, ram_addr_t ram_addr) { bool all_device_dirty_tracking = - vfio_devices_all_device_dirty_tracking(container); + vfio_devices_all_device_dirty_tracking(bcontainer); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); uint64_t dirty_pages; VFIOBitmap vbmap; + VFIODMARange *qrange; int ret; - if (!container->dirty_pages_supported && !all_device_dirty_tracking) { + if (!bcontainer->dirty_pages_supported && !all_device_dirty_tracking) { cpu_physical_memory_set_dirty_range(ram_addr, size, tcg_enabled() ? 
DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE); return 0; } + qrange = vfio_lookup_match_range(container, iova, size); + /* the same as vfio_dma_unmap() */ + assert(qrange); + ret = vfio_bitmap_alloc(&vbmap, size); if (ret) { return ret; } + g_free(vbmap.bitmap); + vbmap.bitmap = qrange->bitmap; if (all_device_dirty_tracking) { - ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size); + ret = vfio_devices_query_dirty_bitmap(bcontainer, &vbmap, iova, size); } else { - ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size); + ret = vfio_container_query_dirty_bitmap(bcontainer, &vbmap, iova, size); } if (ret) { @@ -1183,11 +1208,8 @@ int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, dirty_pages = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr, vbmap.pages); - trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size, - ram_addr, dirty_pages); + trace_vfio_get_dirty_bitmap(iova, size, vbmap.size, ram_addr, dirty_pages); out: - g_free(vbmap.bitmap); - return ret; } @@ -1201,7 +1223,7 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) vfio_giommu_dirty_notifier *gdn = container_of(n, vfio_giommu_dirty_notifier, n); VFIOGuestIOMMU *giommu = gdn->giommu; - VFIOContainer *container = giommu->container; + VFIOContainerBase *bcontainer = giommu->bcontainer; hwaddr iova = iotlb->iova + giommu->iommu_offset; ram_addr_t translated_addr; int ret = -EINVAL; @@ -1216,12 +1238,12 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { - ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, + ret = vfio_get_dirty_bitmap(bcontainer, iova, iotlb->addr_mask + 1, translated_addr); if (ret) { error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", - container, iova, iotlb->addr_mask + 1, ret, + bcontainer, iova, iotlb->addr_mask + 1, ret, strerror(-ret)); } } @@ -1246,16 +1268,17 @@ static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, * Sync the whole mapped region (spanning multiple individual mappings) * in one go. 
*/ - return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); + return vfio_get_dirty_bitmap(vrdl->bcontainer, iova, size, ram_addr); } -static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, - MemoryRegionSection *section) +static int +vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); VFIORamDiscardListener *vrdl = NULL; - QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + QLIST_FOREACH(vrdl, &bcontainer->vrdl_list, next) { if (vrdl->mr == section->mr && vrdl->offset_within_address_space == section->offset_within_address_space) { @@ -1276,7 +1299,7 @@ &vrdl); } -static int vfio_sync_dirty_bitmap(VFIOContainer *container, +static int vfio_sync_dirty_bitmap(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { ram_addr_t ram_addr; @@ -1284,7 +1307,7 @@ if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; - QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { Int128 llend; @@ -1308,13 +1331,13 @@ } return 0; } else if (memory_region_has_ram_discard_manager(section->mr)) { - return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); + return vfio_sync_ram_discard_listener_dirty_bitmap(bcontainer, section); } ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; - return vfio_get_dirty_bitmap(container, + return vfio_get_dirty_bitmap(bcontainer, REAL_HOST_PAGE_ALIGN(section->offset_within_address_space), int128_get64(section->size), ram_addr); } @@ -1322,15 +1345,16 @@ static void vfio_listener_log_sync(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, listener); + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); int ret; if (vfio_listener_skipped_section(section)) { return; } - if (vfio_devices_all_dirty_tracking(container)) { - ret = vfio_sync_dirty_bitmap(container, section); + if (vfio_devices_all_dirty_tracking(bcontainer)) { + ret = vfio_sync_dirty_bitmap(bcontainer, section); if (ret) { error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret, strerror(-ret)); @@ -1339,6 +1363,144 @@ static void vfio_listener_log_sync(MemoryListener *listener, } } +/* + * It is unclear whether the CLEAR_BITMAP ioctl imposes any alignment + * requirement, so mirror the KVM side and align {start, size} to 64 pages. + * + * If no alignment is in fact required, this code could be simplified considerably.
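+ * As a worked example, assuming 4 KiB host pages: VFIO_CLEAR_LOG_ALIGN is 4096 << 6 = 256 KiB, so a clear request at start 0x41000 rounds down to bmap_start 0x40000 with start_delta 0x1000, and size 0x2000 yields bmap_npages = DIV_ROUND_UP(0x3000, 0x40000) << 6 = 64 pages.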
+ */ +#define VFIO_CLEAR_LOG_SHIFT 6 +#define VFIO_CLEAR_LOG_ALIGN (qemu_real_host_page_size() << VFIO_CLEAR_LOG_SHIFT) +#define VFIO_CLEAR_LOG_MASK (-VFIO_CLEAR_LOG_ALIGN) + +static int vfio_log_clear_one_range(VFIOContainer *container, VFIODMARange *qrange, + uint64_t start, uint64_t size) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + + /* + * Now let's deal with the actual bitmap, which is almost the same + * as the kvm side. + */ + uint64_t end, bmap_start, start_delta, bmap_npages; + unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size(); + int ret; + + bmap_start = start & VFIO_CLEAR_LOG_MASK; + start_delta = start - bmap_start; + bmap_start /= psize; + + bmap_npages = DIV_ROUND_UP(size + start_delta, VFIO_CLEAR_LOG_ALIGN) + << VFIO_CLEAR_LOG_SHIFT; + end = qrange->size / psize; + if (bmap_npages > end - bmap_start) { + bmap_npages = end - bmap_start; + } + start_delta /= psize; + + if (start_delta) { + bmap_clear = bitmap_new(bmap_npages); + bitmap_copy_with_src_offset(bmap_clear, qrange->bitmap, + bmap_start, start_delta + size / psize); + bitmap_clear(bmap_clear, 0, start_delta); + range->bitmap.data = (__u64 *)bmap_clear; + } else { + range->bitmap.data = (__u64 *)(qrange->bitmap + BIT_WORD(bmap_start)); + } + + range->iova = qrange->iova + bmap_start * psize; + range->size = bmap_npages * psize; + range->bitmap.size = ROUND_UP(bmap_npages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + range->bitmap.pgsize = qemu_real_host_page_size(); + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + error_report("Failed to clear dirty log for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, + (uint64_t)range->size, errno); + goto err_out; + } + + bitmap_clear(qrange->bitmap, bmap_start + start_delta, size / psize); +err_out: + g_free(bmap_clear); + g_free(dbitmap); + return ret; +} + +static int vfio_physical_log_clear(VFIOContainer *container, + MemoryRegionSection *section) +{ + uint64_t start, size, offset, count; + VFIODMARange *qrange; + int ret = 0; + + if (!container->dirty_log_manual_clear) { + /* No need to do explicit clear */ + return ret; + } + + start = section->offset_within_address_space; + size = int128_get64(section->size); + + if (!size) { + return ret; + } + + QLIST_FOREACH(qrange, &container->dma_list, next) { + /* + * Discard ranges that do not overlap the section (e.g., the + * Memory BAR regions of the device) + */ + if (qrange->iova > start + size - 1 || + start > qrange->iova + qrange->size - 1) { + continue; + } + + if (start >= qrange->iova) { + /* The range starts before section or is aligned to it. */ + offset = start - qrange->iova; + count = MIN(qrange->size - offset, size); + } else { + /* The range starts after the section.
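 * For example (values assumed), a section covering [0x1000, 0x4000) against a qrange at iova 0x2000 of size 0x4000 takes this branch: offset = 0 and count = MIN(0x4000, 0x3000 - 0x1000) = 0x2000 bytes.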
*/ + offset = 0; + count = MIN(qrange->size, size - (qrange->iova - start)); + } + ret = vfio_log_clear_one_range(container, qrange, offset, count); + if (ret < 0) { + break; + } + } + + return ret; +} + +static void vfio_listener_log_clear(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + + if (vfio_listener_skipped_section(section) || + !bcontainer->dirty_pages_supported) { + return; + } + + if (vfio_devices_all_dirty_tracking(bcontainer)) { + vfio_physical_log_clear(container, section); + } +} + const MemoryListener vfio_memory_listener = { .name = "vfio", .region_add = vfio_listener_region_add, @@ -1346,6 +1508,7 @@ const MemoryListener vfio_memory_listener = { .log_global_start = vfio_listener_log_global_start, .log_global_stop = vfio_listener_log_global_stop, .log_sync = vfio_listener_log_sync, + .log_clear = vfio_listener_log_clear, }; void vfio_reset_handler(void *opaque) @@ -1449,10 +1612,13 @@ VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) void vfio_put_address_space(VFIOAddressSpace *space) { - if (QLIST_EMPTY(&space->containers)) { - QLIST_REMOVE(space, list); - g_free(space); + if (!QLIST_EMPTY(&space->containers)) { + return; } + + QLIST_REMOVE(space, list); + g_free(space); + if (QLIST_EMPTY(&vfio_address_spaces)) { qemu_unregister_reset(vfio_reset_handler, NULL); } @@ -1481,3 +1647,53 @@ retry: return info; } + +int vfio_attach_device(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + const VFIOIOMMUClass *ops = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); + HostIOMMUDevice *hiod = NULL; + HostIOMMUDeviceClass *hiod_ops = NULL; + int ret; + + if (vbasedev->iommufd) { + ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + } + + assert(ops); + + if (!vbasedev->mdev) { + hiod = HOST_IOMMU_DEVICE(object_new(ops->hiod_typename)); + hiod_ops = HOST_IOMMU_DEVICE_GET_CLASS(hiod); + vbasedev->hiod = hiod; + } + + ret = ops->attach_device(name, vbasedev, as, errp); + if (ret) { + goto err_attach; + } + + if (hiod_ops && hiod_ops->realize_late && + !hiod_ops->realize_late(hiod, vbasedev, errp)) { + ops->detach_device(vbasedev); + ret = -EINVAL; + goto err_attach; + } + + return 0; + +err_attach: + object_unref(hiod); + vbasedev->hiod = NULL; + return ret; +} + +void vfio_detach_device(VFIODevice *vbasedev) +{ + if (!vbasedev->bcontainer) { + return; + } + object_unref(vbasedev->hiod); + vbasedev->bcontainer->ops->detach_device(vbasedev); +} diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c new file mode 100644 index 0000000000000000000000000000000000000000..913ae49077c4f09b7b27517c1231cfbe4befb7fb --- /dev/null +++ b/hw/vfio/container-base.c @@ -0,0 +1,111 @@ +/* + * VFIO BASE CONTAINER + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 
2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/vfio/vfio-container-base.h" + +int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly) +{ + g_assert(bcontainer->ops->dma_map); + return bcontainer->ops->dma_map(bcontainer, iova, size, vaddr, readonly); +} + +int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + g_assert(bcontainer->ops->dma_unmap); + return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb); +} + +int vfio_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp) +{ + if (!bcontainer->ops->add_window) { + return 0; + } + + return bcontainer->ops->add_window(bcontainer, section, errp); +} + +void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + if (!bcontainer->ops->del_window) { + return; + } + + return bcontainer->ops->del_window(bcontainer, section); +} + +int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start) +{ + if (!bcontainer->dirty_pages_supported) { + return 0; + } + + g_assert(bcontainer->ops->set_dirty_page_tracking); + return bcontainer->ops->set_dirty_page_tracking(bcontainer, start); +} + +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) +{ + g_assert(bcontainer->ops->query_dirty_bitmap); + return bcontainer->ops->query_dirty_bitmap(bcontainer, vbmap, iova, size); +} + +void vfio_container_init(VFIOContainerBase *bcontainer, VFIOAddressSpace *space, + const VFIOIOMMUClass *ops) +{ + bcontainer->ops = ops; + bcontainer->space = space; + bcontainer->error = NULL; + bcontainer->dirty_pages_supported = false; + bcontainer->dma_max_mappings = 0; + bcontainer->iova_ranges = NULL; + QLIST_INIT(&bcontainer->giommu_list); + QLIST_INIT(&bcontainer->vrdl_list); +} + +void vfio_container_destroy(VFIOContainerBase *bcontainer) +{ + VFIOGuestIOMMU *giommu, *tmp; + + QLIST_REMOVE(bcontainer, next); + + QLIST_FOREACH_SAFE(giommu, &bcontainer->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier( + MEMORY_REGION(giommu->iommu_mr), &giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } + + g_list_free_full(bcontainer->iova_ranges, g_free); +} + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU, + .parent = TYPE_INTERFACE, + .class_size = sizeof(VFIOIOMMUClass), + }, +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 242010036af33faa325a34008af40c2cc67a02ea..539cf34b208544b18deee120f8b16747c8ba1010 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -30,9 +30,12 @@ #include "qemu/error-report.h" #include "qemu/range.h" #include "sysemu/reset.h" +#include "sysemu/kvm.h" #include "trace.h" #include "qapi/error.h" #include "migration/migration.h" +#include "sysemu/kvm.h" +#include "pci.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -60,10 +63,11 @@ static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) } } -static int vfio_dma_unmap_bitmap(VFIOContainer *container, +static int vfio_dma_unmap_bitmap(const VFIOContainer *container, hwaddr iova, ram_addr_t size, IOMMUTLBEntry *iotlb) { + const VFIOContainerBase *bcontainer = 
&container->bcontainer; struct vfio_iommu_type1_dma_unmap *unmap; struct vfio_bitmap *bitmap; VFIOBitmap vbmap; @@ -91,7 +95,7 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container, bitmap->size = vbmap.size; bitmap->data = (__u64 *)vbmap.bitmap; - if (vbmap.size > container->max_dirty_bitmap_size) { + if (vbmap.size > bcontainer->max_dirty_bitmap_size) { error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size); ret = -E2BIG; goto unmap_exit; @@ -112,30 +116,73 @@ unmap_exit: return ret; } +VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, + hwaddr start_addr, hwaddr size) +{ + VFIODMARange *qrange; + + QLIST_FOREACH(qrange, &container->dma_list, next) { + if (qrange->iova == start_addr && qrange->size == size) { + return qrange; + } + } + return NULL; +} + +void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange) +{ + uint64_t pages, size; + + pages = REAL_HOST_PAGE_ALIGN(qrange->size) / qemu_real_host_page_size(); + size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / BITS_PER_BYTE; + + qrange->bitmap = g_malloc0(size); +} + /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ -int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb) +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), .flags = 0, .iova = iova, .size = size, }; + VFIODMARange *qrange; bool need_dirty_sync = false; int ret; - if (iotlb && vfio_devices_all_running_and_mig_active(container)) { - if (!vfio_devices_all_device_dirty_tracking(container) && - container->dirty_pages_supported) { + if (iotlb && vfio_devices_all_running_and_mig_active(bcontainer)) { + if (!vfio_devices_all_device_dirty_tracking(bcontainer) && + bcontainer->dirty_pages_supported) { return vfio_dma_unmap_bitmap(container, iova, size, iotlb); } need_dirty_sync = true; } + /* + * unregister the DMA range + * + * It seems that the memory layer will give us the same section as the one + * used in region_add(). Otherwise it'll be complicated to manipulate the + * bitmap across region_{add,del}. Is there any guarantee? + * + * But there is really not such a restriction on the kernel interface + * (VFIO_IOMMU_DIRTY_PAGES_FLAG_{UN}MAP_DMA, etc). 
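+ * For example, a mapping created by vfio_legacy_dma_map(bcontainer, 0x100000, 0x200000, vaddr, readonly) (values assumed) must be undone by an unmap with the identical (iova, size) pair: vfio_lookup_match_range() only matches exact bounds, which the assert() below relies on.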
+ */ + qrange = vfio_lookup_match_range(container, iova, size); + assert(qrange); + g_free(qrange->bitmap); + QLIST_REMOVE(qrange, next); + g_free(qrange); + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { /* * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c @@ -151,8 +198,8 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, */ if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) && container->iommu_type == VFIO_TYPE1v2_IOMMU) { - trace_vfio_dma_unmap_overflow_workaround(); - unmap.size -= 1ULL << ctz64(container->pgsizes); + trace_vfio_legacy_dma_unmap_overflow_workaround(); + unmap.size -= 1ULL << ctz64(bcontainer->pgsizes); continue; } error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); @@ -160,7 +207,7 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, } if (need_dirty_sync) { - ret = vfio_get_dirty_bitmap(container, iova, size, + ret = vfio_get_dirty_bitmap(bcontainer, iova, size, iotlb->translated_addr); if (ret) { return ret; @@ -170,9 +217,11 @@ int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, return 0; } -int vfio_dma_map(VFIOContainer *container, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly) +static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), .flags = VFIO_DMA_MAP_FLAG_READ, @@ -180,6 +229,14 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, .iova = iova, .size = size, }; + VFIODMARange *qrange; + + qrange = g_malloc0(sizeof(*qrange)); + qrange->iova = iova; + qrange->size = size; + QLIST_INSERT_HEAD(&container->dma_list, qrange, next); + /* XXX allocate the dirty bitmap on demand */ + vfio_dma_range_init_dirty_bitmap(qrange); if (!readonly) { map.flags |= VFIO_DMA_MAP_FLAG_WRITE; @@ -191,7 +248,8 @@ int vfio_dma_map(VFIOContainer *container, hwaddr iova, * the VGA ROM space. 
*/ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && + (errno == EBUSY && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -200,17 +258,17 @@ return -errno; } -int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start) +static int +vfio_legacy_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start) { + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); int ret; struct vfio_iommu_type1_dirty_bitmap dirty = { .argsz = sizeof(dirty), }; - if (!container->dirty_pages_supported) { - return 0; - } - if (start) { dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; } else { @@ -227,9 +285,12 @@ return ret; } -int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, - hwaddr iova, hwaddr size) +static int vfio_legacy_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size) { + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); struct vfio_iommu_type1_dirty_bitmap *dbitmap; struct vfio_iommu_type1_dirty_bitmap_get *range; int ret; @@ -237,7 +298,9 @@ dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); - dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + dbitmap->flags = container->dirty_log_manual_clear ? + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR : + VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; range->iova = iova; range->size = size; @@ -296,7 +359,7 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, } static bool vfio_get_info_iova_range(struct vfio_iommu_type1_info *info, - VFIOContainer *container) + VFIOContainerBase *bcontainer) { struct vfio_info_cap_header *hdr; struct vfio_iommu_type1_info_cap_iova_range *cap; @@ -314,8 +377,8 @@ range_set_bounds(range, cap->iova_ranges[i].start, cap->iova_ranges[i].end); - container->iova_ranges = - range_list_insert(container->iova_ranges, range); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); } return true; @@ -349,6 +412,14 @@ static int vfio_get_iommu_type(VFIOContainer *container, VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU }; int i; + if (virtcca_cvm_enabled()) { + if (ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_S_IOMMU)) { + return VFIO_TYPE1v2_S_IOMMU; + } else { + return -errno; + } + } + for (i = 0; i < ARRAY_SIZE(iommu_types); i++) { if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) { return iommu_types[i]; @@ -358,10 +429,35 @@ return -EINVAL; } +/* + * vfio_get_iommu_class - get the VFIOIOMMUClass associated with an IOMMU type + */ +static const VFIOIOMMUClass *vfio_get_iommu_class(int iommu_type, Error **errp) +{ + ObjectClass *klass = NULL; + + switch (iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_S_IOMMU: + klass = object_class_by_name(TYPE_VFIO_IOMMU_LEGACY); + break; + case VFIO_SPAPR_TCE_v2_IOMMU: + case VFIO_SPAPR_TCE_IOMMU: + klass =
object_class_by_name(TYPE_VFIO_IOMMU_SPAPR); + break; + default: + g_assert_not_reached(); + }; + + return VFIO_IOMMU_CLASS(klass); +} + static int vfio_init_container(VFIOContainer *container, int group_fd, - Error **errp) + VFIOAddressSpace *space, Error **errp) { - int iommu_type, ret; + int iommu_type, dirty_log_manual_clear, ret; + const VFIOIOMMUClass *vioc; iommu_type = vfio_get_iommu_type(container, errp); if (iommu_type < 0) { @@ -390,6 +486,20 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, } container->iommu_type = iommu_type; + + dirty_log_manual_clear = ioctl(container->fd, VFIO_CHECK_EXTENSION, + VFIO_DIRTY_LOG_MANUAL_CLEAR); + if (dirty_log_manual_clear) { + container->dirty_log_manual_clear = dirty_log_manual_clear; + } + + vioc = vfio_get_iommu_class(iommu_type, errp); + if (!vioc) { + error_setg(errp, "No available IOMMU models"); + return -EINVAL; + } + + vfio_container_init(&container->bcontainer, space, vioc); return 0; } @@ -442,6 +552,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, { struct vfio_info_cap_header *hdr; struct vfio_iommu_type1_info_cap_migration *cap_mig; + VFIOContainerBase *bcontainer = &container->bcontainer; hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); if (!hdr) { @@ -456,22 +567,72 @@ static void vfio_get_iommu_info_migration(VFIOContainer *container, * qemu_real_host_page_size to mark those dirty. */ if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) { - container->dirty_pages_supported = true; - container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; - container->dirty_pgsizes = cap_mig->pgsize_bitmap; + bcontainer->dirty_pages_supported = true; + bcontainer->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + bcontainer->dirty_pgsizes = cap_mig->pgsize_bitmap; } } -static void vfio_free_container(VFIOContainer *container) +static SharedRegionListener *g_shl; + +static void shared_memory_listener_register(MemoryListener *listener, + AddressSpace *as) { - g_list_free_full(container->iova_ranges, g_free); - g_free(container); + SharedRegionListener *shl; + + shl = g_new0(SharedRegionListener, 1); + + shl->listener = listener; + shl->as = as; + + shared_region_register_listener(shl); + g_shl = shl; +} + +static void shared_memory_listener_unregister(void) +{ + SharedRegionListener *shl = g_shl; + + shared_region_unregister_listener(shl); + + g_free(shl); + g_shl = NULL; +} + +static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) +{ + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + g_autofree struct vfio_iommu_type1_info *info = NULL; + int ret; + + ret = vfio_get_iommu_info(container, &info); + if (ret) { + error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); + return ret; + } + + if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { + bcontainer->pgsizes = info->iova_pgsizes; + } else { + bcontainer->pgsizes = qemu_real_host_page_size(); + } + + if (!vfio_get_info_dma_avail(info, &bcontainer->dma_max_mappings)) { + bcontainer->dma_max_mappings = 65535; + } + + vfio_get_info_iova_range(info, bcontainer); + + vfio_get_iommu_info_migration(container, info); + return 0; } static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { VFIOContainer *container; + VFIOContainerBase *bcontainer; int ret, fd; VFIOAddressSpace *space; @@ -508,7 +669,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * details once we know which type of IOMMU we are using. 
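 * With this series the selected type is also mapped to a VFIOIOMMUClass by vfio_get_iommu_class() above: the type1 variants (including the new VFIO_TYPE1v2_S_IOMMU) resolve to TYPE_VFIO_IOMMU_LEGACY and the sPAPR TCE types to TYPE_VFIO_IOMMU_SPAPR, after which the container's setup/dma_map/dma_unmap callbacks are dispatched through VFIOContainerBase (see container-base.c).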
*/ - QLIST_FOREACH(container, &space->containers, next) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { ret = vfio_ram_block_discard_disable(container, true); if (ret) { @@ -544,16 +706,11 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, } container = g_malloc0(sizeof(*container)); - container->space = space; container->fd = fd; - container->error = NULL; - container->dirty_pages_supported = false; - container->dma_max_mappings = 0; - container->iova_ranges = NULL; - QLIST_INIT(&container->giommu_list); - QLIST_INIT(&container->vrdl_list); - - ret = vfio_init_container(container, group->fd, errp); + QLIST_INIT(&container->dma_list); + bcontainer = &container->bcontainer; + + ret = vfio_init_container(container, group->fd, space, errp); if (ret) { goto free_container_exit; } @@ -564,82 +721,60 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, goto free_container_exit; } - switch (container->iommu_type) { - case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_IOMMU: - { - struct vfio_iommu_type1_info *info; + assert(bcontainer->ops->setup); - ret = vfio_get_iommu_info(container, &info); - if (ret) { - error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info"); - goto enable_discards_exit; - } - - if (info->flags & VFIO_IOMMU_INFO_PGSIZES) { - container->pgsizes = info->iova_pgsizes; - } else { - container->pgsizes = qemu_real_host_page_size(); - } - - if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) { - container->dma_max_mappings = 65535; - } - - vfio_get_info_iova_range(info, container); - - vfio_get_iommu_info_migration(container, info); - g_free(info); - break; - } - case VFIO_SPAPR_TCE_v2_IOMMU: - case VFIO_SPAPR_TCE_IOMMU: - { - ret = vfio_spapr_container_init(container, errp); - if (ret) { - goto enable_discards_exit; - } - break; - } + ret = bcontainer->ops->setup(bcontainer, errp); + if (ret) { + goto enable_discards_exit; } vfio_kvm_device_add_group(group); QLIST_INIT(&container->group_list); - QLIST_INSERT_HEAD(&space->containers, container, next); + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); - container->listener = vfio_memory_listener; + if (kvm_csv3_enabled()) { + shared_memory_listener_register(&bcontainer->listener, + bcontainer->space->as); + } - memory_listener_register(&container->listener, container->space->as); + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); - if (container->error) { + if (bcontainer->error) { ret = -1; - error_propagate_prepend(errp, container->error, + error_propagate_prepend(errp, bcontainer->error, "memory listener initialization failed: "); goto listener_release_exit; } - container->initialized = true; + bcontainer->initialized = true; return 0; listener_release_exit: QLIST_REMOVE(group, container_next); - QLIST_REMOVE(container, next); + QLIST_REMOVE(bcontainer, next); vfio_kvm_device_del_group(group); - memory_listener_unregister(&container->listener); + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); + } else { + memory_listener_unregister(&bcontainer->listener); + } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { - vfio_spapr_container_deinit(container); + if (bcontainer->ops->release) { + 
bcontainer->ops->release(bcontainer); + } } enable_discards_exit: vfio_ram_block_discard_disable(container, false); free_container_exit: - vfio_free_container(container); + g_free(container); close_fd_exit: close(fd); @@ -653,6 +788,7 @@ put_space_exit: static void vfio_disconnect_container(VFIOGroup *group) { VFIOContainer *container = group->container; + VFIOContainerBase *bcontainer = &container->bcontainer; QLIST_REMOVE(group, container_next); group->container = NULL; @@ -663,10 +799,16 @@ static void vfio_disconnect_container(VFIOGroup *group) * group. */ if (QLIST_EMPTY(&container->group_list)) { - memory_listener_unregister(&container->listener); + if (kvm_csv3_enabled()) { + shared_memory_listener_unregister(); + } else { + memory_listener_unregister(&bcontainer->listener); + } if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU || container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { - vfio_spapr_container_deinit(container); + if (bcontainer->ops->release) { + bcontainer->ops->release(bcontainer); + } } } @@ -676,21 +818,13 @@ } if (QLIST_EMPTY(&container->group_list)) { - VFIOAddressSpace *space = container->space; - VFIOGuestIOMMU *giommu, *tmp; + VFIOAddressSpace *space = bcontainer->space; - QLIST_REMOVE(container, next); - - QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { - memory_region_unregister_iommu_notifier( - MEMORY_REGION(giommu->iommu_mr), &giommu->n); - QLIST_REMOVE(giommu, giommu_next); - g_free(giommu); - } + vfio_container_destroy(bcontainer); trace_vfio_disconnect_container(container->fd); close(container->fd); - vfio_free_container(container); + g_free(container); vfio_put_address_space(space); } @@ -705,7 +839,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp) QLIST_FOREACH(group, &vfio_group_list, next) { if (group->groupid == groupid) { /* Found it. Now is it already in the right context? */ - if (group->container->space->as == as) { + if (group->container->bcontainer.space->as == as) { return group; } else { error_setg(errp, "group %d used in multiple address spaces", @@ -799,6 +933,11 @@ static int vfio_get_device(VFIOGroup *group, const char *name, return -1; } + if (!virtcca_cvm_enabled() && (info->flags & VFIO_DEVICE_FLAGS_SECURE)) { + error_setg(errp, "A normal VM cannot use a confidential device"); + return -1; + } + /* * Set discarding of RAM as not broken for this group if the driver knows * the device operates compatibly with discarding. 
Setting must be @@ -877,13 +1016,13 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error **errp) * @name and @vbasedev->name are likely to be different depending * on the type of the device, hence the need for passing @name */ -int vfio_attach_device(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) { int groupid = vfio_device_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; VFIOGroup *group; - VFIOContainer *container; + VFIOContainerBase *bcontainer; int ret; if (groupid < 0) { @@ -892,6 +1031,10 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, trace_vfio_attach_device(vbasedev->name, groupid); + if (!vfio_device_hiod_realize(vbasedev, errp)) { + return -EINVAL; + } + group = vfio_get_group(groupid, as, errp); if (!group) { return -ENOENT; } @@ -910,26 +1053,251 @@ int vfio_attach_device(char *name, VFIODevice *vbasedev, return ret; } - container = group->container; - vbasedev->container = container; - QLIST_INSERT_HEAD(&container->device_list, vbasedev, container_next); + bcontainer = &group->container->bcontainer; + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); return ret; } -void vfio_detach_device(VFIODevice *vbasedev) +static void vfio_legacy_detach_device(VFIODevice *vbasedev) { VFIOGroup *group = vbasedev->group; - if (!vbasedev->container) { - return; - } - QLIST_REMOVE(vbasedev, global_next); QLIST_REMOVE(vbasedev, container_next); - vbasedev->container = NULL; + vbasedev->bcontainer = NULL; trace_vfio_detach_device(vbasedev->name, group->groupid); vfio_put_base_device(vbasedev); vfio_put_group(group); } + +static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + devices = &info->devices[0]; + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + trace_vfio_pci_hot_reset_dep_devices(host.domain, + host.bus, host.slot, host.function, devices[i].group_id); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %s, " + "depends on group %d which is not owned.", + vdev->vbasedev.name, devices[i].group_id); + } + ret = -EPERM; + goto out; + } + + /* Prep dependent devices for reset and clear our marker. 
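The group and device bookkeeping in these functions is all QLIST-based. QEMU's QLIST_* macros are its variants of the BSD <sys/queue.h> LIST_* macros, so the access pattern can be sketched with the portable equivalents (stand-in struct, not the real VFIOGroup):

#include <sys/queue.h>   /* QEMU's QLIST_* mirror these BSD LIST_* macros */
#include <stdio.h>

struct dev {
    int groupid;
    LIST_ENTRY(dev) next;      /* like QLIST_ENTRY() */
};

int main(void)
{
    LIST_HEAD(, dev) list = LIST_HEAD_INITIALIZER(list);
    struct dev a = { .groupid = 1 }, b = { .groupid = 2 }, *it;

    LIST_INSERT_HEAD(&list, &a, next);
    LIST_INSERT_HEAD(&list, &b, next);   /* head insertion: b now first */

    LIST_FOREACH(it, &list, next) {      /* analogous to QLIST_FOREACH */
        printf("group %d\n", it->groupid);
    }
    return 0;
}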
*/ + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + break; + } + } + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Determine how many group fds need to be passed */ + count = 0; + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + count++; + break; + } + } + } + + reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); + reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); + fds = &reset->group_fds[0]; + + /* Fill in group fds */ + QLIST_FOREACH(group, &vfio_group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + fds[reset->count++] = group->fd; + break; + } + } + } + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? strerror(errno) : "Success"); + +out: + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIOPCIDevice *tmp; + VFIODevice *vbasedev_iter; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { + continue; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + break; + } + + QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { + if (!vbasedev_iter->dev->realized || + vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); + if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { + vfio_pci_post_reset(tmp); + break; + } + } + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + +static void vfio_iommu_legacy_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO; + + vioc->setup = vfio_legacy_setup; + vioc->dma_map = vfio_legacy_dma_map; + vioc->dma_unmap = vfio_legacy_dma_unmap; + vioc->attach_device = vfio_legacy_attach_device; + vioc->detach_device = vfio_legacy_detach_device; + vioc->set_dirty_page_tracking = vfio_legacy_set_dirty_page_tracking; + vioc->query_dirty_bitmap = vfio_legacy_query_dirty_bitmap; + vioc->pci_hot_reset = vfio_legacy_pci_hot_reset; +}; + +static bool hiod_legacy_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + + hiod->name = g_strdup(vdev->name); + hiod->agent = opaque; + + return true; +} + +static int hiod_legacy_vfio_get_cap(HostIOMMUDevice *hiod, int cap, + Error **errp) +{ + switch (cap) { + case HOST_IOMMU_DEVICE_CAP_AW_BITS: + return vfio_device_get_aw_bits(hiod->agent); + default: + error_setg(errp, "%s: unsupported capability %x", hiod->name, cap); + return -EINVAL; + } +} + +static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hioc = 
HOST_IOMMU_DEVICE_CLASS(oc); + + hioc->realize = hiod_legacy_vfio_realize; + hioc->get_cap = hiod_legacy_vfio_get_cap; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_LEGACY, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_legacy_class_init, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE, + .class_init = hiod_legacy_vfio_class_init, + } +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/hct.c b/hw/vfio/hct.c new file mode 100644 index 0000000000000000000000000000000000000000..7fd397718201629cd054f9c03682ca6b49e4c598 --- /dev/null +++ b/hw/vfio/hct.c @@ -0,0 +1,551 @@ +/* + * VFIO-based mediated CCP (HCT) assignment support + * + * Copyright 2023 HYGON Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include <linux/vfio.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <unistd.h> + +#include "qemu/osdep.h" +#include "qemu/queue.h" +#include "qemu/main-loop.h" +#include "qemu/log.h" +#include "trace.h" +#include "hw/pci/pci.h" +#include "hw/vfio/pci.h" +#include "qemu/range.h" +#include "sysemu/kvm.h" +#include "hw/pci/msi.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" + +#define MAX_CCP_CNT 48 +#define PAGE_SIZE 4096 +#define HCT_SHARED_MEMORY_SIZE (PAGE_SIZE * MAX_CCP_CNT) +#define CCP_INDEX_BYTES 4 +#define PATH_MAX 4096 +#define TYPE_HCT_DEV "hct" +#define PCI_HCT_DEV(obj) OBJECT_CHECK(HCTDevState, (obj), TYPE_HCT_DEV) +#define HCT_MMIO_SIZE (1 << 20) +#define HCT_MAX_PASID (1 << 8) + +#define PCI_VENDOR_ID_HYGON_CCP 0x1d94 +#define PCI_DEVICE_ID_HYGON_CCP 0x1468 + +#define HCT_SHARE_DEV "/dev/hct_share" + +#define HCT_VERSION_STRING "0.5" +#define DEF_VERSION_STRING "0.1" +#define VERSION_SIZE 16 + +#define HCT_SHARE_IOC_TYPE 'C' +#define HCT_SHARE_OP_TYPE 0x01 +#define HCT_SHARE_OP _IOWR(HCT_SHARE_IOC_TYPE, \ + HCT_SHARE_OP_TYPE, \ + struct hct_dev_ctrl) +#define HCT_SHARE_OP_DMA_MAP 0x01 +#define HCT_SHARE_OP_GET_ID 0x03 +#define HCT_SHARE_OP_GET_PASID 0x04 +#define HCT_SHARE_OP_DMA_UNMAP 0x05 +#define HCT_SHARE_OP_GET_VERSION 0x06 + +/* BARS */ +#define HCT_REG_BAR_IDX 2 +#define HCT_SHARED_BAR_IDX 3 +#define HCT_PASID_BAR_IDX 4 + +#define PASID_OFFSET 40 + +static volatile struct hct_data { + int init; + int hct_fd; + unsigned long pasid; + uint8_t *pasid_memory; + uint8_t *hct_shared_memory; + uint8_t ccp_index[MAX_CCP_CNT]; + uint8_t ccp_cnt; +} hct_data; + +typedef struct SharedDevice { + PCIDevice dev; + int shared_memory_offset; +} SharedDevice; + +typedef struct HctDevState { + SharedDevice sdev; + VFIODevice vdev; + MemoryRegion mmio; + MemoryRegion shared; + MemoryRegion pasid; + void *maps[PCI_NUM_REGIONS]; +} HCTDevState; + +struct hct_dev_ctrl { + unsigned char op; + unsigned char rsvd[3]; + union { + unsigned char version[VERSION_SIZE]; + struct { + unsigned long vaddr; + unsigned long iova; + unsigned long size; + }; + unsigned int id; + }; +}; + +static int pasid_get_and_init(HCTDevState *state) +{ + struct hct_dev_ctrl ctrl; + int ret; + + ctrl.op = HCT_SHARE_OP_GET_PASID; + ctrl.id = -1; + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) { + ret = -errno; + error_report("GET_PASID failed: %d", -errno); + goto out; + } + + *hct_data.pasid_memory = ctrl.id; + hct_data.pasid = ctrl.id; + +out: + return ret; +} + +static const MemoryRegionOps hct_mmio_ops = { + .endianness = DEVICE_NATIVE_ENDIAN, + .valid = + { + .min_access_size = 4, + 
.max_access_size = 4, + }, +}; + +static void vfio_hct_detach_device(HCTDevState *state) +{ + vfio_detach_device(&state->vdev); + + g_free(state->vdev.name); + state->vdev.name = NULL; +} + +static void vfio_hct_exit(PCIDevice *dev) +{ + HCTDevState *state = PCI_HCT_DEV(dev); + + vfio_hct_detach_device(state); + + if (hct_data.hct_fd) { + qemu_close(hct_data.hct_fd); + hct_data.hct_fd = 0; + } +} + +static Property vfio_hct_properties[] = { + DEFINE_PROP_STRING("sysfsdev", HCTDevState, vdev.sysfsdev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vfio_ccp_compute_needs_reset(VFIODevice *vdev) +{ + vdev->needs_reset = false; +} + +struct VFIODeviceOps vfio_ccp_ops = { + .vfio_compute_needs_reset = vfio_ccp_compute_needs_reset, +}; + +/* create BAR2, BAR3 and BAR4 space for the virtual machine. */ +static int vfio_hct_region_mmap(HCTDevState *state) +{ + int ret; + int i; + struct vfio_region_info *info; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + ret = vfio_get_region_info(&state->vdev, i, &info); + if (ret) + goto out; + + if (info->size) { + state->maps[i] = mmap(NULL, info->size, PROT_READ | PROT_WRITE, + MAP_SHARED, state->vdev.fd, info->offset); + if (state->maps[i] == MAP_FAILED) { + ret = -errno; + g_free(info); + error_report("vfio mmap failed"); + goto out; + } + } + g_free(info); + } + + memory_region_init_io(&state->mmio, OBJECT(state), &hct_mmio_ops, state, + "hct mmio", HCT_MMIO_SIZE); + memory_region_init_ram_device_ptr(&state->mmio, OBJECT(state), "hct mmio", + HCT_MMIO_SIZE, + state->maps[HCT_REG_BAR_IDX]); + + memory_region_init_io(&state->shared, OBJECT(state), &hct_mmio_ops, state, + "hct shared memory", PAGE_SIZE); + memory_region_init_ram_device_ptr( + &state->shared, OBJECT(state), "hct shared memory", PAGE_SIZE, + (void *)hct_data.hct_shared_memory + + state->sdev.shared_memory_offset * PAGE_SIZE); + + memory_region_init_io(&state->pasid, OBJECT(state), &hct_mmio_ops, state, + "hct pasid", PAGE_SIZE); + memory_region_init_ram_device_ptr(&state->pasid, OBJECT(state), "hct pasid", + PAGE_SIZE, hct_data.pasid_memory); + + pci_register_bar(&state->sdev.dev, HCT_REG_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->mmio); + pci_register_bar(&state->sdev.dev, HCT_SHARED_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->shared); + pci_register_bar(&state->sdev.dev, HCT_PASID_BAR_IDX, + PCI_BASE_ADDRESS_SPACE_MEMORY, &state->pasid); +out: + return ret; +} + +static int hct_check_duplicated_index(int index) +{ + int cnt; + for (cnt = 0; cnt < hct_data.ccp_cnt; cnt++) { + if (hct_data.ccp_index[cnt] == index) { + error_report("multiple mdevs must not be mapped to the same CCP " + "in one virtual machine"); + return -1; + } + } + + hct_data.ccp_index[hct_data.ccp_cnt++] = index; + return 0; +} + +static int hct_get_ccp_index(HCTDevState *state) +{ + char path[PATH_MAX] = {0}; + char buf[CCP_INDEX_BYTES] = {0}; + int fd; + int ret; + int ccp_index; + + snprintf(path, PATH_MAX, "%s/vendor/id", state->vdev.sysfsdev); + fd = qemu_open_old(path, O_RDONLY); + if (fd < 0) { + error_report("failed to open %s", path); + return -errno; + } + + ret = read(fd, buf, sizeof(buf)); + if (ret < 0) { + ret = -errno; + error_report("failed to read %s", path); + goto out; + } + + if (sscanf(buf, "%d", &ccp_index) != 1) { + ret = -EINVAL; + error_report("failed to parse CCP index from \"%s\"", buf); + goto out; + } + + if (!hct_check_duplicated_index(ccp_index)) { + state->sdev.shared_memory_offset = ccp_index; + } else { + ret = -1; + } + +out: + qemu_close(fd); + return ret; +} + +static int hct_api_version_check(void) +{ + struct hct_dev_ctrl ctrl; + int ret; + + ctrl.op = HCT_SHARE_OP_GET_VERSION; + memcpy(ctrl.version, DEF_VERSION_STRING, sizeof(DEF_VERSION_STRING)); + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) { + error_report("ret %d, errno %d: failed to get the hct.ko version", ret, + errno); + return -1; + } else if (memcmp(ctrl.version, HCT_VERSION_STRING, + sizeof(HCT_VERSION_STRING)) < 0) { + error_report("the hct.ko version is %s; please upgrade to version %s " + "or higher", + ctrl.version, HCT_VERSION_STRING); + return -1; + } + + return 0; +} + +static int hct_shared_memory_init(void) +{ + int ret = 0; + + hct_data.hct_shared_memory = + mmap(NULL, HCT_SHARED_MEMORY_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + hct_data.hct_fd, 0); + if (hct_data.hct_shared_memory == MAP_FAILED) { + ret = -errno; + error_report("failed to map the hct shared memory"); + goto out; + } + +out: + return ret; +} + +static void hct_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct hct_dev_ctrl ctrl; + hwaddr iova; + Int128 llend, llsize; + void *vaddr; + int ret; + + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_add(llend, int128_exts64(qemu_real_host_page_mask())); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + if (!section->mr->ram) { + return; + } + + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + llsize = int128_sub(llend, int128_make64(iova)); + + ctrl.op = HCT_SHARE_OP_DMA_MAP; + ctrl.iova = iova | (hct_data.pasid << PASID_OFFSET); + ctrl.vaddr = (uint64_t)vaddr; + ctrl.size = int128_get64(llsize); + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) + error_report("VFIO_MAP_DMA: %d, iova=0x%" HWADDR_PRIx, -errno, iova); +} + +static void hct_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct hct_dev_ctrl ctrl; + hwaddr iova; + Int128 llend, llsize; + int ret; + + iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space); + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_add(llend, int128_exts64(qemu_real_host_page_mask())); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + if (!section->mr->ram) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + + ctrl.op = HCT_SHARE_OP_DMA_UNMAP; + ctrl.iova = iova | (hct_data.pasid << PASID_OFFSET); + ctrl.size = int128_get64(llsize); + ret = ioctl(hct_data.hct_fd, HCT_SHARE_OP, &ctrl); + if (ret < 0) + error_report("VFIO_UNMAP_DMA: %d", -errno); +} + +static MemoryListener hct_memory_listener = { + .region_add = hct_listener_region_add, + .region_del = hct_listener_region_del, +}; + +static void hct_data_uninit(HCTDevState *state) +{ + if (hct_data.hct_fd) { + qemu_close(hct_data.hct_fd); + hct_data.hct_fd = 0; + } + + if (hct_data.pasid) { + hct_data.pasid = 0; + } + + if (hct_data.pasid_memory) { + munmap(hct_data.pasid_memory, PAGE_SIZE); + hct_data.pasid_memory = NULL; + } + + if (hct_data.hct_shared_memory) { + munmap((void *)hct_data.hct_shared_memory, HCT_SHARED_MEMORY_SIZE); + hct_data.hct_shared_memory = NULL; + } + + memory_listener_unregister(&hct_memory_listener); +} + +static int hct_data_init(HCTDevState *state) +{ + int ret; + + if (hct_data.init == 0) { + hct_data.hct_fd = qemu_open_old(HCT_SHARE_DEV, O_RDWR); + if (hct_data.hct_fd < 0) { + ret = -errno; + error_report("failed to open %s, errno %d", HCT_SHARE_DEV, -ret); + goto out; + } + + /* The hct.ko version must not be lower than HCT_VERSION_STRING. */ + ret = hct_api_version_check(); + if (ret) + goto out; + + /* assign a page to the virtual BAR3 of each CCP. */ + ret = hct_shared_memory_init(); + if (ret) + goto out; + + hct_data.pasid_memory = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (hct_data.pasid_memory == MAP_FAILED) { + ret = -errno; + goto unmap_shared_memory_exit; + } + + /* assign a unique pasid to each virtual machine. */ + ret = pasid_get_and_init(state); + if (ret < 0) + goto unmap_pasid_memory_exit; + + /* perform DMA_MAP and DMA_UNMAP operations on all memories of the + * virtual machine. */ + memory_listener_register(&hct_memory_listener, &address_space_memory); + + hct_data.init = 1; + } + + return hct_get_ccp_index(state); + +unmap_pasid_memory_exit: + munmap(hct_data.pasid_memory, PAGE_SIZE); + +unmap_shared_memory_exit: + munmap((void *)hct_data.hct_shared_memory, HCT_SHARED_MEMORY_SIZE); + +out: + return ret; +} + +/* When device is loaded */ +static void vfio_hct_realize(PCIDevice *pci_dev, Error **errp) +{ + int ret; + char *mdevid; + HCTDevState *state = PCI_HCT_DEV(pci_dev); + + /* derive the mdev device name from the sysfsdev path */ + mdevid = g_path_get_basename(state->vdev.sysfsdev); + state->vdev.name = mdevid; + + ret = hct_data_init(state); + if (ret < 0) { + error_setg(errp, "hct data init failed"); + g_free(state->vdev.name); + state->vdev.name = NULL; + goto out; + } + + ret = vfio_attach_device(state->vdev.name, &state->vdev, + pci_device_iommu_address_space(pci_dev), errp); + + if (ret) { + /* errp has already been set by vfio_attach_device() */ + g_free(state->vdev.name); + state->vdev.name = NULL; + goto data_uninit_out; + } + + state->vdev.ops = &vfio_ccp_ops; + state->vdev.dev = &state->sdev.dev.qdev; + + ret = vfio_hct_region_mmap(state); + if (ret < 0) { + error_setg(errp, "region mmap failed, name = %s", state->vdev.name); + g_free(state->vdev.name); + state->vdev.name = NULL; + goto detach_device_out; + } + + return; + +detach_device_out: + vfio_hct_detach_device(state); + +data_uninit_out: + hct_data_uninit(state); + +out: + return; +} + +static void hct_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + dc->desc = "HCT Device"; + device_class_set_props(dc, vfio_hct_properties); + + pdc->realize = vfio_hct_realize; + pdc->exit = vfio_hct_exit; + pdc->vendor_id = PCI_VENDOR_ID_HYGON_CCP; + pdc->device_id = PCI_DEVICE_ID_HYGON_CCP; + pdc->class_id = PCI_CLASS_CRYPT_OTHER; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + return; +} + +static const TypeInfo pci_hct_info = { + .name = TYPE_HCT_DEV, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(HCTDevState), + .class_init = hct_dev_class_init, + .interfaces = + (InterfaceInfo[]){ + {INTERFACE_CONVENTIONAL_PCI_DEVICE}, + {}, + }, +}; + +static void hct_register_types(void) +{ + type_register_static(&pci_hct_info); +} + +type_init(hct_register_types); diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 168847e7c51ef35afbea276745c1aa7e6cd94ce0..1f3bfed917a966a41f78422c8f03c3ace2dc9e3e 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -27,6 +27,7 @@ #include "trace.h" #include "qapi/error.h" #include "qemu/error-report.h" +#include "monitor/monitor.h" /* * Common VFIO interrupt disable */ @@ -609,3 
+610,98 @@ bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) return ret; } + +int vfio_device_get_name(VFIODevice *vbasedev, Error **errp) +{ + struct stat st; + + if (vbasedev->fd < 0) { + if (stat(vbasedev->sysfsdev, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + return -errno; + } + /* User may specify a name, e.g: VFIO platform device */ + if (!vbasedev->name) { + vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); + } + } else { + if (!vbasedev->iommufd) { + error_setg(errp, "Use FD passing only with iommufd backend"); + return -EINVAL; + } + /* + * Give a name with fd so any function printing out vbasedev->name + * will not break. + */ + if (!vbasedev->name) { + vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd); + } + } + + return 0; +} + +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) +{ + int fd = monitor_fd_param(monitor_cur(), str, errp); + + if (fd < 0) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + vbasedev->fd = fd; +} + +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard) +{ + vbasedev->type = type; + vbasedev->ops = ops; + vbasedev->dev = dev; + vbasedev->fd = -1; + + vbasedev->ram_block_discard_allowed = ram_discard; +} + +int vfio_device_get_aw_bits(VFIODevice *vdev) +{ + /* + * iova_ranges is a sorted list. For old kernels that support + * VFIO but not support query of iova ranges, iova_ranges is NULL, + * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned. + */ + GList *l = g_list_last(vdev->bcontainer->iova_ranges); + + if (l) { + Range *range = l->data; + return range_get_last_bit(range) + 1; + } + + return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX; +} + +bool vfio_device_is_mdev(VFIODevice *vbasedev) +{ + g_autofree char *subsys = NULL; + g_autofree char *tmp = NULL; + + if (!vbasedev->sysfsdev) { + return false; + } + + tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); + subsys = realpath(tmp, NULL); + return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); +} + +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp) +{ + HostIOMMUDevice *hiod = vbasedev->hiod; + + if (!hiod) { + return true; + } + + return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp); +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c new file mode 100644 index 0000000000000000000000000000000000000000..a108beda29e248b4188ba2c74db915f54865e854 --- /dev/null +++ b/hw/vfio/iommufd.c @@ -0,0 +1,916 @@ +/* + * iommufd container backend + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 
2023 + * + * Authors: Yi Liu <yi.l.liu@intel.com> + * Eric Auger <eric.auger@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include <linux/iommufd.h> + +#include "hw/vfio/vfio-common.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "qapi/error.h" +#include "sysemu/iommufd.h" +#include "hw/qdev-core.h" +#include "sysemu/reset.h" +#include "qemu/cutils.h" +#include "qemu/chardev_open.h" +#include "pci.h" +#include "exec/ram_addr.h" + +static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + return iommufd_backend_map_dma(container->be, + container->ioas_id, + iova, size, vaddr, readonly); +} + +static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + + /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ + return iommufd_backend_unmap_dma(container->be, + container->ioas_id, iova, size); + } + +static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp) +{ + return vfio_kvm_device_add_fd(vbasedev->fd, errp); +} + +static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev) +{ + Error *err = NULL; + + if (vfio_kvm_device_del_fd(vbasedev->fd, &err)) { + error_report_err(err); + } +} + +static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp) +{ + IOMMUFDBackend *iommufd = vbasedev->iommufd; + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + }; + int ret; + + ret = iommufd_backend_connect(iommufd, errp); + if (ret) { + return ret; + } + + /* + * Add device to kvm-vfio to be prepared for the tracking + * in KVM. Especially for some emulated devices, it requires + * to have kvm information in the device open. 
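One convention worth calling out here: every VFIO and iommufd ioctl argument struct begins with an argsz field that userspace fills with sizeof(), as the vfio_device_bind_iommufd initializer above does, which is how the ABI stays extensible across kernel versions. A stand-in illustration of the check (fake struct and handler, not a real kernel ABI):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for a VFIO/iommufd ioctl payload. */
struct fake_bind {
    uint32_t argsz;      /* userspace: sizeof(); kernel: minimum-size check */
    uint32_t flags;
    int32_t  iommufd;
    uint32_t out_devid;  /* written back by the "kernel" */
};

static int fake_kernel_handler(struct fake_bind *arg)
{
    if (arg->argsz < sizeof(*arg)) {
        return -1;       /* struct too short for this kernel: reject */
    }
    arg->out_devid = 42;
    return 0;
}

int main(void)
{
    struct fake_bind bind;

    memset(&bind, 0, sizeof(bind));
    bind.argsz = sizeof(bind);
    bind.iommufd = 3;    /* fd from /dev/iommu, illustrative */

    if (fake_kernel_handler(&bind) == 0) {
        printf("bound, devid=%u\n", bind.out_devid);
    }
    return 0;
}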
+ */ + ret = iommufd_cdev_kvm_device_add(vbasedev, errp); + if (ret) { + goto err_kvm_device_add; + } + + /* Bind device to iommufd */ + bind.iommufd = iommufd->fd; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); + if (ret) { + error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d", + vbasedev->fd, bind.iommufd); + goto err_bind; + } + + vbasedev->devid = bind.out_devid; + trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name, + vbasedev->fd, vbasedev->devid); + return ret; +err_bind: + iommufd_cdev_kvm_device_del(vbasedev); +err_kvm_device_add: + iommufd_backend_disconnect(iommufd); + return ret; +} + +static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev) +{ + /* Unbind is automatically conducted when device fd is closed */ + iommufd_cdev_kvm_device_del(vbasedev); + iommufd_backend_disconnect(vbasedev->iommufd); +} + +static bool iommufd_hwpt_dirty_tracking(VFIOIOASHwpt *hwpt) +{ + return hwpt && hwpt->hwpt_flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; +} + +static int iommufd_set_dirty_page_tracking(const VFIOContainerBase *bcontainer, + bool start) +{ + const VFIOIOMMUFDContainer *container = + container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, start, NULL)) { + error_report("Failed to set dirty tracking hwpt_id %u errno: %d", + hwpt->hwpt_id, errno); + goto err; + } + } + + return 0; + +err: + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + iommufd_backend_set_dirty_tracking(container->be, + hwpt->hwpt_id, !start, NULL); + } + return -EINVAL; +} + +static int iommufd_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, hwaddr iova, + hwaddr size) +{ + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + unsigned long page_size = qemu_real_host_page_size(); + VFIOIOASHwpt *hwpt; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + if (!iommufd_hwpt_dirty_tracking(hwpt)) { + continue; + } + + if (!iommufd_backend_get_dirty_bitmap(container->be, hwpt->hwpt_id, + iova, size, page_size, + (uint64_t *)vbmap->bitmap, + NULL)) { + error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)iova, + (uint64_t)size, errno); + + return -EINVAL; + } + } + + return 0; +} + +static int iommufd_cdev_getfd(const char *sysfs_path, Error **errp) +{ + long int ret = -ENOTTY; + char *path, *vfio_dev_path = NULL, *vfio_path = NULL; + DIR *dir = NULL; + struct dirent *dent; + gchar *contents; + struct stat st; + gsize length; + int major, minor; + dev_t vfio_devt; + + path = g_strdup_printf("%s/vfio-dev", sysfs_path); + if (stat(path, &st) < 0) { + error_setg_errno(errp, errno, "no such host device"); + goto out_free_path; + } + + dir = opendir(path); + if (!dir) { + error_setg_errno(errp, errno, "couldn't open directory %s", path); + goto out_free_path; + } + + while ((dent = readdir(dir))) { + if (!strncmp(dent->d_name, "vfio", 4)) { + vfio_dev_path = g_strdup_printf("%s/%s/dev", path, dent->d_name); + break; + } + } + + if (!vfio_dev_path) { + error_setg(errp, "failed to find vfio-dev/vfioX/dev"); + goto out_close_dir; + } + + if (!g_file_get_contents(vfio_dev_path, &contents, &length, NULL)) { + error_setg(errp, "failed to load \"%s\"", vfio_dev_path); + 
goto out_free_dev_path; + } + + if (sscanf(contents, "%d:%d", &major, &minor) != 2) { + error_setg(errp, "failed to get major:minor for \"%s\"", vfio_dev_path); + goto out_free_dev_path; + } + g_free(contents); + vfio_devt = makedev(major, minor); + + vfio_path = g_strdup_printf("/dev/vfio/devices/%s", dent->d_name); + ret = open_cdev(vfio_path, vfio_devt); + if (ret < 0) { + error_setg(errp, "Failed to open %s", vfio_path); + } + + trace_iommufd_cdev_getfd(vfio_path, ret); + g_free(vfio_path); + +out_free_dev_path: + g_free(vfio_dev_path); +out_close_dir: + closedir(dir); +out_free_path: + if (*errp) { + error_prepend(errp, VFIO_MSG_PREFIX, path); + } + g_free(path); + + return ret; +} + +static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id, + Error **errp) +{ + int ret, iommufd = vbasedev->iommufd->fd; + struct vfio_device_attach_iommufd_pt attach_data = { + .argsz = sizeof(attach_data), + .flags = 0, + .pt_id = id, + }; + + /* Attach device to an IOAS or hwpt within iommufd */ + ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); + if (ret) { + error_setg_errno(errp, errno, + "[iommufd=%d] error attach %s (%d) to id=%d", + iommufd, vbasedev->name, vbasedev->fd, id); + return -errno; + } + + trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name, + vbasedev->fd, id); + return 0; +} + +static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp) +{ + int ret, iommufd = vbasedev->iommufd->fd; + struct vfio_device_detach_iommufd_pt detach_data = { + .argsz = sizeof(detach_data), + .flags = 0, + }; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_data); + if (ret) { + error_setg_errno(errp, errno, "detach %s failed", vbasedev->name); + } else { + trace_iommufd_cdev_detach_ioas_hwpt(iommufd, vbasedev->name); + } + return ret; +} + +static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + ERRP_GUARD(); + IOMMUFDBackend *iommufd = vbasedev->iommufd; + uint32_t flags = 0; + VFIOIOASHwpt *hwpt; + uint32_t hwpt_id; + int ret; + + /* Try to find a domain */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + /* -EINVAL means the domain is incompatible with the device. */ + if (ret == -EINVAL) { + /* + * It is an expected failure and it just means we will try + * another domain, or create one if no existing compatible + * domain is found. Hence why the error is discarded below. + */ + error_free(*errp); + *errp = NULL; + continue; + } + + return ret; + } else { + vbasedev->hwpt = hwpt; + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); + return 0; + } + } + + /* + * This is quite early and VFIO Migration state isn't yet fully + * initialized, thus rely only on IOMMU hardware capabilities as to + * whether IOMMU dirty tracking is going to be requested. Later + * vfio_migration_realize() may decide to use VF dirty tracking + * instead. 
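iommufd_cdev_getfd() above recovers the character device from sysfs: it reads the vfio-dev/vfioX/dev attribute, which holds a "major:minor" pair, rebuilds the dev_t with makedev(), and lets open_cdev() validate that /dev/vfio/devices/vfioX really has that dev_t. The parsing step in isolation (sample string rather than a sysfs read):

#include <stdio.h>
#include <sys/sysmacros.h>   /* makedev() */
#include <sys/types.h>

int main(void)
{
    const char *contents = "511:0";  /* sample "dev" attribute contents */
    int maj, min;
    dev_t devt;

    if (sscanf(contents, "%d:%d", &maj, &min) != 2) {
        fprintf(stderr, "malformed dev attribute\n");
        return 1;
    }
    devt = makedev(maj, min);
    printf("character device %d:%d -> dev_t %#llx\n",
           maj, min, (unsigned long long)devt);
    return 0;
}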
+ */ + if (vbasedev->hiod->caps.hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { + flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + } + + if (!iommufd_backend_alloc_hwpt(iommufd, vbasedev->devid, + container->ioas_id, flags, + IOMMU_HWPT_DATA_NONE, 0, NULL, + &hwpt_id, NULL, errp)) { + return -EINVAL; + } + + hwpt = g_malloc0(sizeof(*hwpt)); + hwpt->hwpt_id = hwpt_id; + hwpt->hwpt_flags = flags; + QLIST_INIT(&hwpt->device_list); + + ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, errp); + if (ret) { + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + return ret; + } + + vbasedev->hwpt = hwpt; + vbasedev->iommu_dirty_tracking = iommufd_hwpt_dirty_tracking(hwpt); + QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next); + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + container->bcontainer.dirty_pages_supported |= + vbasedev->iommu_dirty_tracking; + if (container->bcontainer.dirty_pages_supported && + !vbasedev->iommu_dirty_tracking) { + warn_report("IOMMU instance for device %s doesn't support dirty tracking", + vbasedev->name); + } + return 0; +} + +static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + VFIOIOASHwpt *hwpt = vbasedev->hwpt; + + QLIST_REMOVE(vbasedev, hwpt_next); + vbasedev->hwpt = NULL; + + if (QLIST_EMPTY(&hwpt->device_list)) { + QLIST_REMOVE(hwpt, next); + iommufd_backend_free_id(container->be, hwpt->hwpt_id); + g_free(hwpt); + } +} + +static int iommufd_cdev_attach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container, + Error **errp) +{ + /* mdevs aren't physical devices and will fail with auto domains */ + if (!vbasedev->mdev) { + return iommufd_cdev_autodomains_get(vbasedev, container, errp); + } + + return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp); +} + +static void iommufd_cdev_detach_container(VFIODevice *vbasedev, + VFIOIOMMUFDContainer *container) +{ + Error *err = NULL; + + if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) { + error_report_err(err); + } + + if (vbasedev->hwpt) { + iommufd_cdev_autodomains_put(vbasedev, container); + } + +} + +static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + if (!QLIST_EMPTY(&bcontainer->device_list)) { + return; + } + memory_listener_unregister(&bcontainer->listener); + vfio_container_destroy(bcontainer); + iommufd_backend_free_id(container->be, container->ioas_id); + g_free(container); +} + +static int iommufd_cdev_ram_block_discard_disable(bool state) +{ + /* + * We support coordinated discarding of RAM via the RamDiscardManager. 
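The hwpt reuse logic in iommufd_cdev_autodomains_get() above treats -EINVAL from the attach as "incompatible domain, try the next one" and any other error as fatal, falling back to allocating a fresh hwpt when nothing matches. The control flow in miniature (fake attach(), illustrative constants):

#include <errno.h>
#include <stdio.h>

#define NDOMAINS 3

static int attach(int domain_id)
{
    return domain_id == 2 ? 0 : -EINVAL;  /* only domain 2 is compatible */
}

int main(void)
{
    int id, attached = -1;

    for (id = 0; id < NDOMAINS; id++) {
        int ret = attach(id);
        if (ret == -EINVAL) {
            continue;             /* expected: probe the next domain */
        }
        if (ret) {
            return 1;             /* real failure: give up */
        }
        attached = id;
        break;
    }
    if (attached < 0) {
        attached = NDOMAINS;      /* no match: allocate a fresh hwpt */
    }
    printf("using domain %d\n", attached);
    return 0;
}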
+ */ + return ram_block_uncoordinated_discard_disable(state); +} + +static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container, + uint32_t ioas_id, Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + struct iommu_ioas_iova_ranges *info; + struct iommu_iova_range *iova_ranges; + int ret, sz, fd = container->be->fd; + + info = g_malloc0(sizeof(*info)); + info->size = sizeof(*info); + info->ioas_id = ioas_id; + + ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); + if (ret && errno != EMSGSIZE) { + goto error; + } + + sz = info->num_iovas * sizeof(struct iommu_iova_range); + info = g_realloc(info, sizeof(*info) + sz); + info->allowed_iovas = (uintptr_t)(info + 1); + + ret = ioctl(fd, IOMMU_IOAS_IOVA_RANGES, info); + if (ret) { + goto error; + } + + iova_ranges = (struct iommu_iova_range *)(uintptr_t)info->allowed_iovas; + + for (int i = 0; i < info->num_iovas; i++) { + Range *range = g_new(Range, 1); + + range_set_bounds(range, iova_ranges[i].start, iova_ranges[i].last); + bcontainer->iova_ranges = + range_list_insert(bcontainer->iova_ranges, range); + } + bcontainer->pgsizes = info->out_iova_alignment; + + g_free(info); + return 0; + +error: + ret = -errno; + g_free(info); + error_setg_errno(errp, errno, "Cannot get IOVA ranges"); + return ret; +} + +static int iommufd_cdev_attach(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + VFIOContainerBase *bcontainer; + VFIOIOMMUFDContainer *container; + VFIOAddressSpace *space; + struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; + int ret, devfd; + uint32_t ioas_id; + Error *err = NULL; + const VFIOIOMMUClass *iommufd_vioc = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + + if (vbasedev->fd < 0) { + devfd = iommufd_cdev_getfd(vbasedev->sysfsdev, errp); + if (devfd < 0) { + return devfd; + } + vbasedev->fd = devfd; + } else { + devfd = vbasedev->fd; + } + + ret = iommufd_cdev_connect_and_bind(vbasedev, errp); + if (ret) { + goto err_connect_bind; + } + + space = vfio_get_address_space(as); + + /* + * The HostIOMMUDevice data from legacy backend is static and doesn't need + * any information from the (type1-iommu) backend to be initialized. In + * contrast however, the IOMMUFD HostIOMMUDevice data requires the iommufd + * FD to be connected and having a devid to be able to successfully call + * iommufd_backend_get_device_info(). 
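iommufd_cdev_get_info_iova_range() above uses the classic two-call sizing idiom: the first IOMMU_IOAS_IOVA_RANGES call may fail with EMSGSIZE but still reports num_iovas, the buffer is regrown, and the ioctl is repeated. The shape of the idiom with a fake query function standing in for the ioctl:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct query {
    size_t capacity;   /* what the caller allocated */
    size_t needed;     /* filled in by the "kernel" */
    int *out;
};

static int fake_query(struct query *q)
{
    q->needed = 4;                 /* kernel always reports the real count */
    if (q->capacity < q->needed) {
        errno = EMSGSIZE;
        return -1;
    }
    for (size_t i = 0; i < q->needed; i++) {
        q->out[i] = (int)i;
    }
    return 0;
}

int main(void)
{
    struct query q = { 0 };

    if (fake_query(&q) && errno != EMSGSIZE) {
        return 1;                  /* anything but EMSGSIZE is fatal */
    }
    q.out = calloc(q.needed, sizeof(*q.out));
    q.capacity = q.needed;
    if (fake_query(&q)) {          /* second call with a big-enough buffer */
        return 1;
    }
    printf("got %zu ranges\n", q.needed);
    free(q.out);
    return 0;
}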
+ */ + if (!vfio_device_hiod_realize(vbasedev, errp)) { + goto err_alloc_ioas; + } + + /* try to attach to an existing container in this space */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + if (bcontainer->ops != iommufd_vioc || + vbasedev->iommufd != container->be) { + continue; + } + if (iommufd_cdev_attach_container(vbasedev, container, &err)) { + const char *msg = error_get_pretty(err); + + trace_iommufd_cdev_fail_attach_existing_container(msg); + error_free(err); + err = NULL; + } else { + ret = iommufd_cdev_ram_block_discard_disable(true); + if (ret) { + error_setg(errp, + "Cannot set discarding of RAM broken (%d)", ret); + goto err_discard_disable; + } + goto found_container; + } + } + + /* Need to allocate a new dedicated container */ + ret = iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp); + if (ret < 0) { + goto err_alloc_ioas; + } + + trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id); + + container = g_malloc0(sizeof(*container)); + container->be = vbasedev->iommufd; + container->ioas_id = ioas_id; + QLIST_INIT(&container->hwpt_list); + + bcontainer = &container->bcontainer; + vfio_container_init(bcontainer, space, iommufd_vioc); + QLIST_INSERT_HEAD(&space->containers, bcontainer, next); + + ret = iommufd_cdev_attach_container(vbasedev, container, errp); + if (ret) { + goto err_attach_container; + } + + ret = iommufd_cdev_ram_block_discard_disable(true); + if (ret) { + goto err_discard_disable; + } + + ret = iommufd_cdev_get_info_iova_range(container, ioas_id, &err); + if (ret) { + error_append_hint(&err, + "Fallback to default 64bit IOVA range and 4K page size\n"); + warn_report_err(err); + err = NULL; + bcontainer->pgsizes = qemu_real_host_page_size(); + } + + bcontainer->listener = vfio_memory_listener; + memory_listener_register(&bcontainer->listener, bcontainer->space->as); + + if (bcontainer->error) { + ret = -1; + error_propagate_prepend(errp, bcontainer->error, + "memory listener initialization failed: "); + goto err_listener_register; + } + + bcontainer->initialized = true; + +found_container: + ret = ioctl(devfd, VFIO_DEVICE_GET_INFO, &dev_info); + if (ret) { + error_setg_errno(errp, errno, "error getting device info"); + goto err_listener_register; + } + + /* + * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level + * for discarding incompatibility check as well? 
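The attach path above unwinds failures with the kernel-style goto ladder visible in its err_* labels: each label undoes only the steps that had already succeeded, in reverse order. Reduced to a runnable skeleton (stand-in step()/undo() helpers):

#include <stdio.h>

static int step(const char *name, int fail)
{
    printf("do %s\n", name);
    return fail;
}

static void undo(const char *name)
{
    printf("undo %s\n", name);
}

int main(void)
{
    int ret;

    if ((ret = step("connect", 0)))  goto err_connect;
    if ((ret = step("attach", 0)))   goto err_attach;
    if ((ret = step("listener", 1))) goto err_listener;  /* simulate failure */
    return 0;

err_listener:
    undo("attach");      /* only the steps that succeeded are rolled back */
err_attach:
    undo("connect");
err_connect:
    return ret;
}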
+ */ + if (vbasedev->ram_block_discard_allowed) { + iommufd_cdev_ram_block_discard_disable(false); + } + + vbasedev->group = 0; + vbasedev->num_irqs = dev_info.num_irqs; + vbasedev->num_regions = dev_info.num_regions; + vbasedev->flags = dev_info.flags; + vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, + vbasedev->num_regions, vbasedev->flags); + return 0; + +err_listener_register: + iommufd_cdev_ram_block_discard_disable(false); +err_discard_disable: + iommufd_cdev_detach_container(vbasedev, container); +err_attach_container: + iommufd_cdev_container_destroy(container); +err_alloc_ioas: + vfio_put_address_space(space); + iommufd_cdev_unbind_and_disconnect(vbasedev); +err_connect_bind: + close(vbasedev->fd); + return ret; +} + +static void iommufd_cdev_detach(VFIODevice *vbasedev) +{ + VFIOContainerBase *bcontainer = vbasedev->bcontainer; + VFIOAddressSpace *space = bcontainer->space; + VFIOIOMMUFDContainer *container = container_of(bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + QLIST_REMOVE(vbasedev, global_next); + QLIST_REMOVE(vbasedev, container_next); + vbasedev->bcontainer = NULL; + + if (!vbasedev->ram_block_discard_allowed) { + iommufd_cdev_ram_block_discard_disable(false); + } + + iommufd_cdev_detach_container(vbasedev, container); + iommufd_cdev_container_destroy(container); + vfio_put_address_space(space); + + iommufd_cdev_unbind_and_disconnect(vbasedev); + close(vbasedev->fd); +} + +static VFIODevice *iommufd_cdev_pci_find_by_devid(__u32 devid) +{ + VFIODevice *vbasedev_iter; + const VFIOIOMMUClass *iommufd_vioc = + VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); + + QLIST_FOREACH(vbasedev_iter, &vfio_device_list, global_next) { + if (vbasedev_iter->bcontainer->ops != iommufd_vioc) { + continue; + } + if (devid == vbasedev_iter->devid) { + return vbasedev_iter; + } + } + return NULL; +} + +static VFIOPCIDevice * +iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device *dep_dev, + VFIODevice *reset_dev) +{ + VFIODevice *vbasedev_tmp; + + if (dep_dev->devid == reset_dev->devid || + dep_dev->devid == VFIO_PCI_DEVID_OWNED) { + return NULL; + } + + vbasedev_tmp = iommufd_cdev_pci_find_by_devid(dep_dev->devid); + if (!vbasedev_tmp || !vbasedev_tmp->dev->realized || + vbasedev_tmp->type != VFIO_DEVICE_TYPE_PCI) { + return NULL; + } + + return container_of(vbasedev_tmp, VFIOPCIDevice, vbasedev); +} + +static int iommufd_cdev_pci_hot_reset(VFIODevice *vbasedev, bool single) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + struct vfio_pci_hot_reset_info *info = NULL; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int ret, i; + bool multi = false; + + trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); + + if (!single) { + vfio_pci_pre_reset(vdev); + } + vdev->vbasedev.needs_reset = false; + + ret = vfio_pci_get_pci_hot_reset_info(vdev, &info); + + if (ret) { + goto out_single; + } + + assert(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID); + + devices = &info->devices[0]; + + if (!(info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED)) { + if (!vdev->has_pm_reset) { + for (i = 0; i < info->count; i++) { + if (devices[i].devid == VFIO_PCI_DEVID_NOT_OWNED) { + error_report("vfio: Cannot reset device %s, " + "depends on device %04x:%02x:%02x.%x " + "which is not owned.", + vdev->vbasedev.name, devices[i].segment, + devices[i].bus, PCI_SLOT(devices[i].devfn), + PCI_FUNC(devices[i].devfn)); + } + } + } + ret = -EPERM; + goto out_single; + } + + trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); + + for (i = 0; i < info->count; i++) { + VFIOPCIDevice *tmp; + + trace_iommufd_cdev_pci_hot_reset_dep_devices(devices[i].segment, + devices[i].bus, + PCI_SLOT(devices[i].devfn), + PCI_FUNC(devices[i].devfn), + devices[i].devid); + + /* + * If a VFIO cdev device is resettable, all the dependent devices + * are either bound to same iommufd or within same iommu_groups as + * one of the iommufd bound devices. + */ + assert(devices[i].devid != VFIO_PCI_DEVID_NOT_OWNED); + + tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); + if (!tmp) { + continue; + } + + if (single) { + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->vbasedev.needs_reset = false; + multi = true; + } + + if (!single && !multi) { + ret = -EINVAL; + goto out_single; + } + + /* Use zero length array for hot reset with iommufd backend */ + reset = g_malloc0(sizeof(*reset)); + reset->argsz = sizeof(*reset); + + /* Bus reset! */ + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + if (ret) { + ret = -errno; + } + + trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, + ret ? 
strerror(errno) : "Success"); + + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + VFIOPCIDevice *tmp; + + tmp = iommufd_cdev_dep_get_realized_vpdev(&devices[i], &vdev->vbasedev); + if (!tmp) { + continue; + } + vfio_pci_post_reset(tmp); + } +out_single: + if (!single) { + vfio_pci_post_reset(vdev); + } + g_free(info); + + return ret; +} + +static void vfio_iommu_iommufd_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->hiod_typename = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO; + + vioc->dma_map = iommufd_cdev_map; + vioc->dma_unmap = iommufd_cdev_unmap; + vioc->attach_device = iommufd_cdev_attach; + vioc->detach_device = iommufd_cdev_detach; + vioc->pci_hot_reset = iommufd_cdev_pci_hot_reset; + vioc->set_dirty_page_tracking = iommufd_set_dirty_page_tracking; + vioc->query_dirty_bitmap = iommufd_query_dirty_bitmap; +}; + +static bool +host_iommu_device_iommufd_vfio_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt_id, errp); +} + +static bool +host_iommu_device_iommufd_vfio_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp) +{ + VFIODevice *vbasedev = HOST_IOMMU_DEVICE(idev)->agent; + + return !iommufd_cdev_detach_ioas_hwpt(vbasedev, errp); +} + +static bool hiod_iommufd_vfio_realize_late(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + VFIOIOMMUFDContainer *container = container_of(vdev->bcontainer, + VFIOIOMMUFDContainer, + bcontainer); + HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(hiod); + + idev->iommufd = vdev->iommufd; + idev->devid = vdev->devid; + idev->ioas_id = container->ioas_id; + + return true; +} + +static bool hiod_iommufd_vfio_realize(HostIOMMUDevice *hiod, void *opaque, + Error **errp) +{ + VFIODevice *vdev = opaque; + HostIOMMUDeviceCaps *caps = &hiod->caps; + enum iommu_hw_info_type type; + union { + struct iommu_hw_info_vtd vtd; + } data; + uint64_t hw_caps; + uint8_t pasids; + + hiod->agent = opaque; + + if (!iommufd_backend_get_device_info(vdev->iommufd, vdev->devid, + &type, &data, sizeof(data), + &hw_caps, &pasids, errp)) { + return false; + } + + hiod->name = g_strdup(vdev->name); + caps->type = type; + caps->hw_caps = hw_caps; + caps->max_pasid_log2 = pasids; + + return true; +} + +static void hiod_iommufd_vfio_class_init(ObjectClass *oc, void *data) +{ + HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc); + HostIOMMUDeviceIOMMUFDClass *idevc = HOST_IOMMU_DEVICE_IOMMUFD_CLASS(oc); + + hiodc->realize = hiod_iommufd_vfio_realize; + hiodc->realize_late = hiod_iommufd_vfio_realize_late; + + idevc->attach_hwpt = host_iommu_device_iommufd_vfio_attach_hwpt; + idevc->detach_hwpt = host_iommu_device_iommufd_vfio_detach_hwpt; +}; + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_IOMMUFD, + .parent = TYPE_VFIO_IOMMU, + .class_init = vfio_iommu_iommufd_class_init, + }, { + .name = TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, + .parent = TYPE_HOST_IOMMU_DEVICE_IOMMUFD, + .class_init = hiod_iommufd_vfio_class_init, + } +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 2a6912c94027d6213144f910d64625a469cc2b1f..bda2688983e9e7013dcd12f6696253e835ded8ef 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -2,10 +2,14 @@ vfio_ss = ss.source_set() vfio_ss.add(files( 'helpers.c', 'common.c', + 'container-base.c', 'container.c', - 'spapr.c', 
'migration.c', )) +vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) +vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( + 'iommufd.c', +)) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', 'pci-quirks.c', @@ -17,5 +21,6 @@ vfio_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) vfio_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) vfio_ss.add(when: 'CONFIG_VFIO_AP', if_true: files('ap.c')) vfio_ss.add(when: 'CONFIG_VFIO_IGD', if_true: files('igd.c')) +vfio_ss.add(when: 'CONFIG_VFIO_HCT', if_true: files('hct.c')) specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 28d422b39f9f70e94a2f396b0fb064c5de17dc28..3924beb2893fa642add56a07f79fc89b63fb1209 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -945,16 +945,18 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) return !vfio_block_migration(vbasedev, err, errp); } - if (!vbasedev->dirty_pages_supported) { + if ((!vbasedev->dirty_pages_supported || + vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) && + !vbasedev->iommu_dirty_tracking) { if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { error_setg(&err, - "%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + "%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); goto add_blocker; } - warn_report("%s: VFIO device doesn't support device dirty tracking", - vbasedev->name); + warn_report("%s: VFIO device doesn't support device and " + "IOMMU dirty tracking", vbasedev->name); } ret = vfio_block_multiple_devices_migration(vbasedev, errp); diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 84b1a7b9485c5c1758179ffe7ce6126d1f240f5d..a71ebe26b427e6ca2ceb383b5d5adfe3460cfb91 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1209,6 +1209,190 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, return 0; } +#define PCI_VENDOR_ID_HUAWEI 0x19e5 +#define PCI_DEVICE_ID_ASCEND910 0xd801 +#define PCI_DEVICE_ID_ASCEND710 0xd500 +#define PCI_DEVICE_ID_ASCEND310 0xd100 +#define PCI_SUB_DEVICE_ID_ASCEND710_1P_MIN 0x100 +#define PCI_SUB_DEVICE_ID_ASCEND710_1P_MAX 0x10f +#define PCI_SUB_DEVICE_ID_ASCEND710_2P_MIN 0x110 +#define PCI_SUB_DEVICE_ID_ASCEND710_2P_MAX 0x11f +#define ASCEND910_XLOADER_SIZE 4 +#define ASCEND910_XLOADER_OFFSET 0x80400 +#define ASCEND710_2P_BASE (128 * 1024 * 1024) +#define ASCEND710_1P_DEVNUM 1 +#define ASCEND710_2P_DEVNUM 2 +#define ASCEND710_XLOADER_SIZE 4 +#define ASCEND710_XLOADER_OFFSET 0x100430 +#define ASCEND310_XLOADER_SIZE 4 +#define ASCEND310_XLOADER_OFFSET 0x400 + +typedef struct VFIOAscendBarQuirk { + struct VFIOPCIDevice *vdev; + pcibus_t offset; + uint8_t bar; + MemoryRegion *mem; +} VFIOAscendBarQuirk; + +static uint64_t vfio_ascend_quirk_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOAscendBarQuirk *quirk = opaque; + VFIOPCIDevice *vdev = quirk->vdev; + + qemu_log("read RO region! addr=0x%" HWADDR_PRIx ", size=%d\n", + addr + quirk->offset, size); + + return vfio_region_read(&vdev->bars[quirk->bar].region, + addr + quirk->offset, size); +} + +static void vfio_ascend_quirk_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOAscendBarQuirk *quirk = opaque; + + qemu_log("modifying RO region is not allowed! 
addr=0x%" + HWADDR_PRIx ", data=0x%" PRIx64 ", size=%d\n", + addr + quirk->offset, data, size); +} + +static const MemoryRegionOps vfio_ascend_intercept_regs_quirk = { + .read = vfio_ascend_quirk_read, + .write = vfio_ascend_quirk_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_ascend910_bar0_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar0_quirk; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 0 || + vdev->device_id != PCI_DEVICE_ID_ASCEND910) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = 1; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar0_quirk = quirk->data = g_new0(typeof(*bar0_quirk), quirk->nr_mem); + bar0_quirk[0].vdev = vdev; + bar0_quirk[0].offset = ASCEND910_XLOADER_OFFSET; + bar0_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar0_quirk[0], + "vfio-ascend910-bar0-intercept-regs-quirk", + ASCEND910_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar0_quirk[0].offset, + &quirk->mem[0], 1); + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + +static void vfio_probe_ascend710_bar2_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar2_quirk; + int sub_device_id; + int devnum = 0; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 2 || + vdev->device_id != PCI_DEVICE_ID_ASCEND710) { + return; + } + + sub_device_id = pci_get_word(vdev->pdev.config + PCI_SUBSYSTEM_ID); + if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND710_1P_MIN && + sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND710_1P_MAX) { + devnum = ASCEND710_1P_DEVNUM; + } else if (sub_device_id >= PCI_SUB_DEVICE_ID_ASCEND710_2P_MIN && + sub_device_id <= PCI_SUB_DEVICE_ID_ASCEND710_2P_MAX) { + devnum = ASCEND710_2P_DEVNUM; + } + + if (devnum != ASCEND710_1P_DEVNUM && devnum != ASCEND710_2P_DEVNUM) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = devnum; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar2_quirk = quirk->data = g_new0(typeof(*bar2_quirk), quirk->nr_mem); + bar2_quirk[0].vdev = vdev; + bar2_quirk[0].offset = ASCEND710_XLOADER_OFFSET; + bar2_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar2_quirk[0], + "vfio-ascend710-bar2-1p-intercept-regs-quirk", + ASCEND710_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar2_quirk[0].offset, + &quirk->mem[0], 1); + + if (devnum == ASCEND710_2P_DEVNUM) { + bar2_quirk[1].vdev = vdev; + bar2_quirk[1].offset = (ASCEND710_2P_BASE + ASCEND710_XLOADER_OFFSET); + bar2_quirk[1].bar = nr; + + memory_region_init_io(&quirk->mem[1], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar2_quirk[1], + "vfio-ascend710-bar2-2p-intercept-regs-quirk", + ASCEND710_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar2_quirk[1].offset, + &quirk->mem[1], 1); + } + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + +static void vfio_probe_ascend310_bar4_quirk(VFIOPCIDevice *vdev, int nr) +{ + VFIOQuirk *quirk; + VFIOAscendBarQuirk *bar4_quirk; + + if (vdev->vendor_id != PCI_VENDOR_ID_HUAWEI || nr != 4 || + vdev->device_id != PCI_DEVICE_ID_ASCEND310) { + return; + } + + 
quirk = g_malloc0(sizeof(*quirk)); + quirk->nr_mem = 1; + quirk->mem = g_new0(MemoryRegion, quirk->nr_mem); + bar4_quirk = quirk->data = g_new0(typeof(*bar4_quirk), quirk->nr_mem); + bar4_quirk[0].vdev = vdev; + bar4_quirk[0].offset = ASCEND310_XLOADER_OFFSET; + bar4_quirk[0].bar = nr; + + /* + * intercept w/r to the xloader-updating register, + * so the vm can't enable xloader-updating + */ + memory_region_init_io(&quirk->mem[0], OBJECT(vdev), + &vfio_ascend_intercept_regs_quirk, + &bar4_quirk[0], + "vfio-ascend310-bar4-intercept-regs-quirk", + ASCEND310_XLOADER_SIZE); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + bar4_quirk[0].offset, + &quirk->mem[0], 1); + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); +} + /* * Common quirk probe entry points. */ @@ -1261,6 +1445,9 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) #ifdef CONFIG_VFIO_IGD vfio_probe_igd_bar4_quirk(vdev, nr); #endif + vfio_probe_ascend910_bar0_quirk(vdev, nr); + vfio_probe_ascend710_bar2_quirk(vdev, nr); + vfio_probe_ascend310_bar4_quirk(vdev, nr); } void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c62c02f7b692c98bba1b931ebb1a4254a7f56061..ce958848b61d0d871a08ce28dc9bc5009870d1a2 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -19,7 +19,9 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include +#include #include #include "hw/hw.h" @@ -42,6 +44,7 @@ #include "qapi/error.h" #include "migration/blocker.h" #include "migration/qemu-file.h" +#include "sysemu/iommufd.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -504,8 +507,9 @@ static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, PCIDevice *pdev) { - kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev); - kvm_irqchip_commit_routes(kvm_state); + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); + kvm_irqchip_update_msi_route(&c, vector->virq, msg, pdev); + kvm_irqchip_commit_route_changes(&c); } static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, @@ -2346,6 +2350,33 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) } + { + HostIOMMUDeviceCaps *caps = &vdev->vbasedev.hiod->caps; + + /* + * TODO: Add option for enabling pasid at a safe offset, this adds the + * pasid capability in the end of the PCIE config space. 
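+ * + * The layout assumed here follows the PCIe PASID extended capability: + * Max PASID Width sits in bits 12:8 of the capability register (hence + * the max_pasid_log2 << 8 below), and the Exec/Privileged Mode bits + * are only advertised when the host IOMMU reports the matching + * IOMMU_HW_CAP_PCI_PASID_* flags.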
+ */ + if (caps->max_pasid_log2 && pci_device_get_pasid_cap(&vdev->pdev)) { + uint16_t pasid_caps = (caps->max_pasid_log2 << 8) & PCI_PASID_CAP_WIDTH; + + if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_EXEC) { + pasid_caps |= PCI_PASID_CAP_EXEC; + } + + if (caps->hw_caps & IOMMU_HW_CAP_PCI_PASID_PRIV) { + pasid_caps |= PCI_PASID_CAP_PRIV; + } + + pcie_pasid_init(pdev, + PCIE_CONFIG_SPACE_SIZE - PCI_EXT_CAP_PASID_SIZEOF, + pasid_caps); + + /* PASID capability is fully emulated by QEMU */ + memset(vdev->emulated_config_bits + pdev->exp.pasid_cap, 0xff, 8); + } + } + /* Cleanup chain head ID if necessary */ if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); @@ -2374,7 +2405,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) return 0; } -static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) +void vfio_pci_pre_reset(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; uint16_t cmd; @@ -2411,7 +2442,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } -static void vfio_pci_post_reset(VFIOPCIDevice *vdev) +void vfio_pci_post_reset(VFIOPCIDevice *vdev) { Error *err = NULL; int nr; @@ -2435,7 +2466,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_quirk_reset(vdev); } -static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) { char tmp[13]; @@ -2445,22 +2476,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) return (strcmp(tmp, name) == 0); } -static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p) { - VFIOGroup *group; struct vfio_pci_hot_reset_info *info; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; + int ret, count; - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? 
"one" : "multi"); - - if (!single) { - vfio_pci_pre_reset(vdev); - } - vdev->vbasedev.needs_reset = false; + assert(info_p && !*info_p); info = g_malloc0(sizeof(*info)); info->argsz = sizeof(*info); @@ -2468,163 +2490,36 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret && errno != ENOSPC) { ret = -errno; + g_free(info); if (!vdev->has_pm_reset) { error_report("vfio: Cannot reset device %s, " "no available reset mechanism.", vdev->vbasedev.name); } - goto out_single; + return ret; } count = info->count; - info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); - info->argsz = sizeof(*info) + (count * sizeof(*devices)); - devices = &info->devices[0]; + info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0]))); + info->argsz = sizeof(*info) + (count * sizeof(info->devices[0])); ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret) { ret = -errno; + g_free(info); error_report("vfio: hot reset info failed: %m"); - goto out_single; - } - - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); - - /* Verify that we have all the groups required */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - trace_vfio_pci_hot_reset_dep_devices(host.domain, - host.bus, host.slot, host.function, devices[i].group_id); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - if (!vdev->has_pm_reset) { - error_report("vfio: Cannot reset device %s, " - "depends on group %d which is not owned.", - vdev->vbasedev.name, devices[i].group_id); - } - ret = -EPERM; - goto out; - } - - /* Prep dependent devices for reset and clear our marker. */ - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - if (single) { - ret = -EINVAL; - goto out_single; - } - vfio_pci_pre_reset(tmp); - tmp->vbasedev.needs_reset = false; - multi = true; - break; - } - } - } - - if (!single && !multi) { - ret = -EINVAL; - goto out_single; - } - - /* Determine how many group fds need to be passed */ - count = 0; - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - count++; - break; - } - } - } - - reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); - reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); - fds = &reset->group_fds[0]; - - /* Fill in group fds */ - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - fds[reset->count++] = group->fd; - break; - } - } + return ret; } - /* Bus reset! */ - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); - g_free(reset); - - trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, - ret ? 
strerror(errno) : "Success"); - -out: - /* Re-enable INTx on affected devices */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - break; - } + *info_p = info; + return 0; +} - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - vfio_pci_post_reset(tmp); - break; - } - } - } -out_single: - if (!single) { - vfio_pci_post_reset(vdev); - } - g_free(info); +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +{ + VFIODevice *vbasedev = &vdev->vbasedev; + const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops; - return ret; + return ops->pci_hot_reset(vbasedev, single); } /* @@ -3076,19 +2971,19 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); VFIODevice *vbasedev = &vdev->vbasedev; - char *tmp, *subsys; Error *err = NULL; - struct stat st; int i, ret; - bool is_mdev; char uuid[UUID_STR_LEN]; char *name; - if (!vbasedev->sysfsdev) { + if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || ~vdev->host.slot || ~vdev->host.function)) { error_setg(errp, "No provided host device"); error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " +#ifdef CONFIG_IOMMUFD + "or -device vfio-pci,fd=DEVICE_FD " +#endif "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); return; } @@ -3098,32 +2993,21 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vdev->host.slot, vdev->host.function); } - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, "no such host device"); - error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev); + if (vfio_device_get_name(vbasedev, errp) < 0) { return; } - vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - vbasedev->ops = &vfio_pci_ops; - vbasedev->type = VFIO_DEVICE_TYPE_PCI; - vbasedev->dev = DEVICE(vdev); - /* * Mediated devices *might* operate compatibly with discarding of RAM, but * we cannot know for certain, it depends on whether the mdev vendor driver * stays in sync with the active working set of the guest driver. Prevent * the x-balloon-allowed option unless this is minimally an mdev device. 
*/ - tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); - subsys = realpath(tmp, NULL); - g_free(tmp); - is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); - free(subsys); + vbasedev->mdev = vfio_device_is_mdev(vbasedev); - trace_vfio_mdev(vbasedev->name, is_mdev); + trace_vfio_mdev(vbasedev->name, vbasedev->mdev); - if (vbasedev->ram_block_discard_allowed && !is_mdev) { + if (vbasedev->ram_block_discard_allowed && !vbasedev->mdev) { error_setg(errp, "x-balloon-allowed only potentially compatible " "with mdev devices"); goto error; @@ -3246,6 +3130,12 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bars_register(vdev); + if (!vbasedev->mdev && + !pci_device_set_iommu_device(pdev, vbasedev->hiod, errp)) { + error_prepend(errp, "Failed to set iommu_device: "); + goto out_teardown; + } + ret = vfio_add_capabilities(vdev, errp); if (ret) { goto out_teardown; @@ -3267,7 +3157,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) error_setg(errp, "cannot support IGD OpRegion feature on hotplugged " "device"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_get_dev_region_info(vbasedev, @@ -3276,13 +3166,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) if (ret) { error_setg_errno(errp, -ret, "does not support requested IGD OpRegion feature"); - goto out_teardown; + goto out_unset_idev; } ret = vfio_pci_igd_opregion_init(vdev, opregion, errp); g_free(opregion); if (ret) { - goto out_teardown; + goto out_unset_idev; } } @@ -3368,6 +3258,10 @@ out_deregister: if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } +out_unset_idev: + if (!vbasedev->mdev) { + pci_device_unset_iommu_device(pdev); + } out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); @@ -3396,6 +3290,7 @@ static void vfio_instance_finalize(Object *obj) static void vfio_exitfn(PCIDevice *pdev) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); vfio_unregister_err_notifier(vdev); @@ -3410,7 +3305,8 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); vfio_bars_exit(vdev); - vfio_migration_exit(&vdev->vbasedev); + vfio_migration_exit(vbasedev); + pci_device_unset_iommu_device(pdev); } static void vfio_pci_reset(DeviceState *dev) @@ -3456,6 +3352,7 @@ static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, "bootindex", NULL, @@ -3465,6 +3362,9 @@ static void vfio_instance_init(Object *obj) vdev->host.slot = ~0U; vdev->host.function = ~0U; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops, + DEVICE(vdev), false); + vdev->nv_gpudirect_clique = 0xFF; /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command @@ -3479,6 +3379,9 @@ static Property vfio_pci_dev_properties[] = { DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, ON_OFF_AUTO_ON), + DEFINE_PROP_ON_OFF_AUTO("x-device-dirty-page-tracking", VFIOPCIDevice, + vbasedev.device_dirty_page_tracking, + ON_OFF_AUTO_ON), DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice, display, ON_OFF_AUTO_OFF), DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0), @@ -3517,14 +3420,20 @@ static Property vfio_pci_dev_properties[] = { qdev_prop_nv_gpudirect_clique, uint8_t), DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, 
OFF_AUTOPCIBAR_OFF), - /* - * TODO - support passed fds... is this necessary? - * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), - * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), - */ +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; +#ifdef CONFIG_IOMMUFD +static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); +} +#endif + static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3532,6 +3441,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) dc->reset = vfio_pci_reset; device_class_set_props(dc, vfio_pci_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); +#endif dc->desc = "VFIO-based PCI device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index fba8737ab2cb23118c0819f600379773d718ed18..6e64a2654e690af11b72710530a41135b726e96f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -218,6 +218,12 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); extern const PropertyInfo qdev_prop_nv_gpudirect_clique; +void vfio_pci_pre_reset(VFIOPCIDevice *vdev); +void vfio_pci_post_reset(VFIOPCIDevice *vdev); +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name); +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p); + int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 8e3d4ac45824ec69afb523f8f0e668327122cd02..a8d9b7da633e0717421acbe9a951334b074b6607 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -15,11 +15,13 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include "qapi/error.h" #include #include #include "hw/vfio/vfio-platform.h" +#include "sysemu/iommufd.h" #include "migration/vmstate.h" #include "qemu/error-report.h" #include "qemu/lockable.h" @@ -529,14 +531,13 @@ static VFIODeviceOps vfio_platform_ops = { */ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) { - struct stat st; int ret; - /* @sysfsdev takes precedence over @host */ - if (vbasedev->sysfsdev) { + /* @fd takes precedence over @sysfsdev which takes precedence over @host */ + if (vbasedev->fd < 0 && vbasedev->sysfsdev) { g_free(vbasedev->name); vbasedev->name = g_path_get_basename(vbasedev->sysfsdev); - } else { + } else if (vbasedev->fd < 0) { if (!vbasedev->name || strchr(vbasedev->name, '/')) { error_setg(errp, "wrong host device name"); return -EINVAL; @@ -546,10 +547,9 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp) vbasedev->name); } - if (stat(vbasedev->sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, - "failed to get the sysfs host device file status"); - return -errno; + ret = vfio_device_get_name(vbasedev, errp); + if (ret) { + return ret; } ret = vfio_attach_device(vbasedev->name, vbasedev, @@ -581,10 +581,6 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; int i, ret; - vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM; - vbasedev->dev = dev; - vbasedev->ops = &vfio_platform_ops; - qemu_mutex_init(&vdev->intp_mutex); 
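+ /* + * With CONFIG_IOMMUFD the device can also be created from a pre-opened + * cdev file descriptor instead of a sysfsdev path, e.g. (illustrative + * command line, with fd=23 handed over by a management application): + * + * -object iommufd,id=iommufd0 + * -device vfio-platform,iommufd=iommufd0,fd=23 + */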
trace_vfio_platform_realize(vbasedev->sysfsdev ? @@ -649,9 +645,29 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPlatformDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; +static void vfio_platform_instance_init(Object *obj) +{ + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(obj); + VFIODevice *vbasedev = &vdev->vbasedev; + + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops, + DEVICE(vdev), false); +} + +#ifdef CONFIG_IOMMUFD +static void vfio_platform_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PLATFORM_DEVICE(obj)->vbasedev, str, errp); +} +#endif + static void vfio_platform_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -659,6 +675,9 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data) dc->realize = vfio_platform_realize; device_class_set_props(dc, vfio_platform_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_platform_set_fd); +#endif dc->vmsd = &vfio_platform_vmstate; dc->desc = "VFIO-based platform device assignment"; sbc->connect_irq_notifier = vfio_start_irqfd_injection; @@ -671,6 +690,7 @@ static const TypeInfo vfio_platform_dev_info = { .name = TYPE_VFIO_PLATFORM, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(VFIOPlatformDevice), + .instance_init = vfio_platform_instance_init, .class_init = vfio_platform_class_init, .class_size = sizeof(VFIOPlatformDeviceClass), }; diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 83da2f7ec213dab2acca9b96a1d07a22a49d22c4..697f80d11df7a301cf3d0a6e274a723308e3ea0d 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -24,6 +24,12 @@ #include "qapi/error.h" #include "trace.h" +typedef struct VFIOSpaprContainer { + VFIOContainer container; + MemoryListener prereg_listener; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; +} VFIOSpaprContainer; + static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) { if (memory_region_is_iommu(section->mr)) { @@ -44,8 +50,10 @@ static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) static void vfio_prereg_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - prereg_listener); + VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, + prereg_listener); + VFIOContainer *container = &scontainer->container; + VFIOContainerBase *bcontainer = &container->bcontainer; const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -88,9 +96,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. 
*/ - if (!container->initialized) { - if (!container->error) { - error_setg_errno(&container->error, -ret, + if (!bcontainer->initialized) { + if (!bcontainer->error) { + error_setg_errno(&bcontainer->error, -ret, "Memory registering failed"); } } else { @@ -102,8 +110,9 @@ static void vfio_prereg_listener_region_add(MemoryListener *listener, static void vfio_prereg_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { - VFIOContainer *container = container_of(listener, VFIOContainer, - prereg_listener); + VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, + prereg_listener); + VFIOContainer *container = &scontainer->container; const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; @@ -146,12 +155,12 @@ static const MemoryListener vfio_prereg_listener = { .region_del = vfio_prereg_listener_region_del, }; -static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, +static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova, hwaddr max_iova, uint64_t iova_pgsizes) { VFIOHostDMAWindow *hostwin; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (ranges_overlap(hostwin->min_iova, hostwin->max_iova - hostwin->min_iova + 1, min_iova, @@ -165,15 +174,15 @@ static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova, hostwin->min_iova = min_iova; hostwin->max_iova = max_iova; hostwin->iova_pgsizes = iova_pgsizes; - QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next); + QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next); } -static int vfio_host_win_del(VFIOContainer *container, +static int vfio_host_win_del(VFIOSpaprContainer *scontainer, hwaddr min_iova, hwaddr max_iova) { VFIOHostDMAWindow *hostwin; - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { QLIST_REMOVE(hostwin, hostwin_next); g_free(hostwin); @@ -184,7 +193,7 @@ static int vfio_host_win_del(VFIOContainer *container, return -1; } -static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container, +static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container, hwaddr iova, hwaddr end) { VFIOHostDMAWindow *hostwin; @@ -226,6 +235,7 @@ static int vfio_spapr_create_window(VFIOContainer *container, hwaddr *pgsize) { int ret = 0; + VFIOContainerBase *bcontainer = &container->bcontainer; IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; unsigned entries, bits_total, bits_per_level, max_levels; @@ -239,13 +249,13 @@ static int vfio_spapr_create_window(VFIOContainer *container, if (pagesize > rampagesize) { pagesize = rampagesize; } - pgmask = container->pgsizes & (pagesize | (pagesize - 1)); + pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1)); pagesize = pgmask ? 
(1ULL << (63 - clz64(pgmask))) : 0; if (!pagesize) { error_report("Host doesn't support page size 0x%"PRIx64 ", the supported mask is 0x%lx", memory_region_iommu_get_min_page_size(iommu_mr), - container->pgsizes); + bcontainer->pgsizes); return -EINVAL; } @@ -313,10 +323,15 @@ static int vfio_spapr_create_window(VFIOContainer *container, return 0; } -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp) +static int +vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); VFIOHostDMAWindow *hostwin; hwaddr pgsize = 0; int ret; @@ -332,7 +347,7 @@ int vfio_container_add_section_window(VFIOContainer *container, iova = section->offset_within_address_space; end = iova + int128_get64(section->size) - 1; - if (!vfio_find_hostwin(container, iova, end)) { + if (!vfio_find_hostwin(scontainer, iova, end)) { error_setg(errp, "Container %p can't map guest IOVA region" " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end); @@ -346,7 +361,7 @@ int vfio_container_add_section_window(VFIOContainer *container, } /* For now intersections are not allowed, we may relax this later */ - QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { + QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { if (ranges_overlap(hostwin->min_iova, hostwin->max_iova - hostwin->min_iova + 1, section->offset_within_address_space, @@ -368,7 +383,7 @@ int vfio_container_add_section_window(VFIOContainer *container, return ret; } - vfio_host_win_add(container, section->offset_within_address_space, + vfio_host_win_add(scontainer, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1, pgsize); #ifdef CONFIG_KVM @@ -401,16 +416,22 @@ int vfio_container_add_section_window(VFIOContainer *container, return 0; } -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section) +static void +vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) { + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); + if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { return; } vfio_spapr_remove_window(container, section->offset_within_address_space); - if (vfio_host_win_del(container, + if (vfio_host_win_del(scontainer, section->offset_within_address_space, section->offset_within_address_space + int128_get64(section->size) - 1) < 0) { @@ -419,13 +440,45 @@ void vfio_container_del_section_window(VFIOContainer *container, } } +static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) +{ + VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); + VFIOHostDMAWindow *hostwin, *next; + + if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { + memory_listener_unregister(&scontainer->prereg_listener); + } + QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, + next) { + QLIST_REMOVE(hostwin, hostwin_next); + g_free(hostwin); + } +} + +static VFIOIOMMUOps vfio_iommu_spapr_ops; + +static void setup_spapr_ops(VFIOContainerBase *bcontainer) 
+{ + vfio_iommu_spapr_ops = *bcontainer->ops; + vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window; + vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window; + vfio_iommu_spapr_ops.release = vfio_spapr_container_release; + bcontainer->ops = &vfio_iommu_spapr_ops; +} + int vfio_spapr_container_init(VFIOContainer *container, Error **errp) { + VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); struct vfio_iommu_spapr_tce_info info; bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; int ret, fd = container->fd; - QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&scontainer->hostwin_list); /* * The host kernel code implementing VFIO_IOMMU_DISABLE is called @@ -439,13 +492,13 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) return -errno; } } else { - container->prereg_listener = vfio_prereg_listener; + scontainer->prereg_listener = vfio_prereg_listener; - memory_listener_register(&container->prereg_listener, + memory_listener_register(&scontainer->prereg_listener, &address_space_memory); - if (container->error) { + if (bcontainer->error) { ret = -1; - error_propagate_prepend(errp, container->error, + error_propagate_prepend(errp, bcontainer->error, "RAM memory listener initialization failed: "); goto listener_unregister_exit; } @@ -461,7 +514,7 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } if (v2) { - container->pgsizes = info.ddw.pgsizes; + bcontainer->pgsizes = info.ddw.pgsizes; /* * There is a default window in just created container. * To make region_add/del simpler, we better remove this @@ -476,32 +529,56 @@ int vfio_spapr_container_init(VFIOContainer *container, Error **errp) } } else { /* The default table uses 4K pages */ - container->pgsizes = 0x1000; - vfio_host_win_add(container, info.dma32_window_start, + bcontainer->pgsizes = 0x1000; + vfio_host_win_add(scontainer, info.dma32_window_start, info.dma32_window_start + info.dma32_window_size - 1, 0x1000); } + setup_spapr_ops(bcontainer); + return 0; listener_unregister_exit: if (v2) { - memory_listener_unregister(&container->prereg_listener); + memory_listener_unregister(&scontainer->prereg_listener); } return ret; } void vfio_spapr_container_deinit(VFIOContainer *container) { + VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, + container); VFIOHostDMAWindow *hostwin, *next; if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { - memory_listener_unregister(&container->prereg_listener); + memory_listener_unregister(&scontainer->prereg_listener); } - QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next, + QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, next) { QLIST_REMOVE(hostwin, hostwin_next); g_free(hostwin); } } + +static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data) +{ + VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); + + vioc->add_window = vfio_spapr_container_add_section_window; + vioc->del_window = vfio_spapr_container_del_section_window; + /* vioc->release = vfio_spapr_container_release; */ + /* vioc->setup = vfio_spapr_container_setup; */ +} + +static const TypeInfo types[] = { + { + .name = TYPE_VFIO_IOMMU_SPAPR, + .parent = TYPE_VFIO_IOMMU_LEGACY, + .class_init = vfio_iommu_spapr_class_init, + }, +}; + +DEFINE_TYPES(types) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index
0eb2387cf24c920b0904ec4012b0fcd3f2e8b3cf..8fdde5445697789edeb4c6383566c1b417cc1595 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -116,8 +116,8 @@ vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Re vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x" -vfio_dma_unmap_overflow_workaround(void) "" -vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 +vfio_legacy_dma_unmap_overflow_workaround(void) "" +vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 # platform.c @@ -164,3 +164,14 @@ vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcop vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64 vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" vfio_vmstate_change_prepare(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s" + +#iommufd.c + +iommufd_cdev_connect_and_bind(int iommufd, const char *name, int devfd, int devid) " [iommufd=%d] Successfully bound device %s (fd=%d): output devid=%d" +iommufd_cdev_getfd(const char *dev, int devfd) " %s (fd=%d)" +iommufd_cdev_attach_ioas_hwpt(int iommufd, const char *name, int devfd, int id) " [iommufd=%d] Successfully attached device %s (%d) to id=%d" +iommufd_cdev_detach_ioas_hwpt(int iommufd, const char *name) " [iommufd=%d] Successfully detached %s" +iommufd_cdev_fail_attach_existing_container(const char *msg) " %s" +iommufd_cdev_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD container with ioasid=%d" +iommufd_cdev_device_info(char *name, int devfd, int num_irqs, int num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d" +iommufd_cdev_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int dev_id) "\t%04x:%02x:%02x.%x devid %d" diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index c0055a78326c1344d2df79090443bdc4817a5cd0..67291563d369cb5cbe2c1abd170207306cea3c9f 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -5,7 +5,7 @@ system_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c') system_virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c')) system_virtio_ss.add(when: 'CONFIG_VHOST_VSOCK_COMMON', if_true: files('vhost-vsock-common.c')) system_virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu.c')) -system_virtio_ss.add(when: 'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c')) +system_virtio_ss.add(when: 
'CONFIG_VHOST_VDPA_DEV', if_true: files('vdpa-dev.c', 'vdpa-dev-mig.c', 'vdpa-dev-iommufd.c')) specific_virtio_ss = ss.source_set() specific_virtio_ss.add(files('virtio.c')) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 637cac4edf0f3459915d9d840bfc02501e7ca7a5..439f45a569bef1e56fb357900b64bd5ffc7835a2 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -48,11 +48,10 @@ vhost_vdpa_set_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRI vhost_vdpa_get_device_id(void *dev, uint32_t device_id) "dev: %p device_id %"PRIu32 vhost_vdpa_reset_device(void *dev) "dev: %p" vhost_vdpa_get_vq_index(void *dev, int idx, int vq_idx) "dev: %p idx: %d vq idx: %d" -vhost_vdpa_set_vring_ready(void *dev, unsigned i, int r) "dev: %p, idx: %u, r: %d" +vhost_vdpa_set_vring_enable_one(void *dev, unsigned i, int enable, int r) "dev: %p, idx: %u, enable: %u, r: %d" vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s" vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32 vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32 -vhost_vdpa_suspend(void *dev) "dev: %p" vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d" vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p" vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 diff --git a/hw/virtio/vdpa-dev-iommufd.c b/hw/virtio/vdpa-dev-iommufd.c new file mode 100644 index 0000000000000000000000000000000000000000..f5718bae99d431ff60aa2d9ba0ada0a0b6df46de --- /dev/null +++ b/hw/virtio/vdpa-dev-iommufd.c @@ -0,0 +1,483 @@ +/* + * vhost vdpa device iommufd backend + * + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All Rights Reserved. 
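+ * + * vhost-vdpa devices bound to the same iommufd backend share one + * VDPAIOMMUFDContainer: guest memory is mapped once per container into + * an IOMMUFD IOAS, and each device attaches to a hardware page table + * (HWPT) allocated from that IOAS, rather than using the per-device + * vhost-vdpa memory listener.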
+ */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include +#include +#include "qapi/error.h" +#include "exec/target_page.h" +#include "exec/address-spaces.h" +#include "hw/virtio/vdpa-dev-iommufd.h" +#include "migration/migration.h" +#include "qapi/qapi-commands-migration.h" +#include "hw/virtio/vhost.h" + +static QLIST_HEAD(, VDPAIOMMUFDContainer) vdpa_container_list = + QLIST_HEAD_INITIALIZER(vdpa_container_list); + +static int vhost_vdpa_iommufd_container_dma_map(VDPAIOMMUFDContainer *container, hwaddr iova, + hwaddr size, void *vaddr, bool readonly) +{ + return iommufd_backend_map_dma(container->iommufd, container->ioas_id, iova, size, vaddr, readonly); + +} +static int vhost_vdpa_iommufd_container_dma_unmap(VDPAIOMMUFDContainer *container, + hwaddr iova, hwaddr size) +{ + return iommufd_backend_unmap_dma(container->iommufd, container->ioas_id, iova, size); +} + +static void vhost_vdpa_iommufd_container_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + hwaddr iova; + Int128 llend, llsize; + void *vaddr; + int page_size = qemu_target_page_size(); + int page_mask = -page_size; + int ret; + + if (vhost_vdpa_listener_skipped_section(section, 0, ULLONG_MAX, page_mask)) { + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) != + (section->offset_within_region & ~page_mask))) { + return; + } + + iova = ROUND_UP(section->offset_within_address_space, page_size); + llend = vhost_vdpa_section_end(section, page_mask); + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + memory_region_ref(section->mr); + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + + llsize = int128_sub(llend, int128_make64(iova)); + + ret = vhost_vdpa_iommufd_container_dma_map(container, iova, int128_get64(llsize), + vaddr, section->readonly); + if (ret) { + qemu_log("vhost vdpa iommufd container dma map failed: %d\n", ret); + } +} + +static void vhost_vdpa_iommufd_container_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + hwaddr iova; + Int128 llend, llsize; + int page_size = qemu_target_page_size(); + int page_mask = -page_size; + int ret; + + if (vhost_vdpa_listener_skipped_section(section, 0, ULLONG_MAX, page_mask)) { + return; + } + + if (unlikely((section->offset_within_address_space & ~page_mask) != + (section->offset_within_region & ~page_mask))) { + return; + } + + iova = ROUND_UP(section->offset_within_address_space, page_size); + llend = vhost_vdpa_section_end(section, page_mask); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + /* + * The unmap ioctl doesn't accept a full 64-bit. 
need to check it + */ + if (int128_eq(llsize, int128_2_64())) { + llsize = int128_rshift(llsize, 1); + ret = vhost_vdpa_iommufd_container_dma_unmap(container, iova, int128_get64(llsize)); + + if (ret) { + qemu_log("vhost vdpa iommufd container unmap failed(0x%" HWADDR_PRIx ", " + "0x%" HWADDR_PRIx ") = %d (%m)", iova, int128_get64(llsize), ret); + } + iova += int128_get64(llsize); + } + ret = vhost_vdpa_iommufd_container_dma_unmap(container, iova, int128_get64(llsize)); + + if (ret) { + qemu_log("vhost vdpa iommufd container unmap failed(0x%" HWADDR_PRIx ", " + "0x%" HWADDR_PRIx ") = %d (%m)", iova, int128_get64(llsize), ret); + } + + memory_region_unref(section->mr); +} + +static void vhost_vdpa_iommufd_container_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + VDPAIOMMUFDContainer *container = container_of(listener, VDPAIOMMUFDContainer, listener); + IOMMUFDHWPT *hwpt; + VhostVdpaDevice *vdev; + MigrationState *ms = migrate_get_current(); + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(vdev, &hwpt->device_list, next) { + if (!vdev->dev.log_enabled || !vdev->dev.log) { + continue; + } + + /** + * For the vhost-vdpa device, log_sync is performed on the entire VM, + * that is, this sync is for the entire flatview. + * Therefore, the first MemoryRegionSection of flatview needs to be + * synchronized. The rest of the mrs do not need to be synchronized. + */ + if (is_first_section(section)) { + int r = vdev->dev.vhost_ops->vhost_log_sync(&vdev->dev); + if (r < 0) { + qemu_log("Failed to sync dirty log: %d\n", r); + if (migration_is_running(ms->state)) { + qmp_migrate_cancel(NULL); + } + return; + } + } + + /** + * Dirty maps are merged separately by MRS, so each MRS needs to be iterated. + */ + if (vhost_bytemap_log_support(&vdev->dev)) { + vhost_sync_dirty_bytemap(&vdev->dev, section); + } else { + vhost_sync_dirty_bitmap(&vdev->dev, section, 0x0, ~0x0ULL); + } + return; + } + } +} + + +/* + * IOTLB API used by vhost vdpa iommufd container + */ +const MemoryListener vhost_vdpa_iommufd_container_listener = { + .name = "vhost-vdpa-iommufd-container", + .region_add = vhost_vdpa_iommufd_container_region_add, + .region_del = vhost_vdpa_iommufd_container_region_del, + .log_sync = vhost_vdpa_iommufd_container_log_sync, +}; + +static int vhost_vdpa_container_connect_iommufd(VDPAIOMMUFDContainer *container) +{ + IOMMUFDBackend *iommufd = container->iommufd; + uint32_t ioas_id; + Error *err = NULL; + + if (!iommufd) { + return -1; + } + + if (iommufd_backend_connect(iommufd, &err)) { + error_report_err(err); + return -1; + } + + if (iommufd_backend_alloc_ioas(iommufd, &ioas_id, &err)) { + error_report_err(err); + iommufd_backend_disconnect(iommufd); + return -1; + } + container->ioas_id = ioas_id; + return 0; +} + +static void vhost_vdpa_container_disconnect_iommufd(VDPAIOMMUFDContainer *container) +{ + IOMMUFDBackend *iommufd = container->iommufd; + uint32_t ioas_id = container->ioas_id; + + if (!iommufd) { + return; + } + + iommufd_backend_free_id(iommufd, ioas_id); + iommufd_backend_disconnect(iommufd); +} + +static IOMMUFDHWPT *vhost_vdpa_find_hwpt(VDPAIOMMUFDContainer *container, + VhostVdpaDevice *vdev) +{ + IOMMUFDHWPT *hwpt = NULL; + VhostVdpaDevice *tmp = NULL; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(tmp, &hwpt->device_list, next) { + if (tmp == vdev) { + return hwpt; + } + } + } + + return NULL; +} + +static VDPAIOMMUFDContainer *vhost_vdpa_find_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = 
NULL; + + QLIST_FOREACH(container, &vdpa_container_list, next) { + if (container->iommufd == vdev->iommufd) { + return container; + } + } + + return NULL; +} + +static VDPAIOMMUFDContainer *vhost_vdpa_create_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + + container = g_new0(VDPAIOMMUFDContainer, 1); + container->iommufd = vdev->iommufd; + container->listener = vhost_vdpa_iommufd_container_listener; + QLIST_INIT(&container->hwpt_list); + + QLIST_INSERT_HEAD(&vdpa_container_list, container, next); + + return container; +} + +static void vhost_vdpa_destroy_container(VDPAIOMMUFDContainer *container) +{ + if (!container) { + return; + } + + container->iommufd = NULL; + QLIST_SAFE_REMOVE(container, next); + g_free(container); +} + +static void vhost_vdpa_device_unbind_iommufd(VhostVdpaDevice *vdev) +{ + int ret; + ret = ioctl(vdev->vhostfd, VHOST_VDPA_UNBIND_IOMMUFD, 0); + if (ret) { + qemu_log("vhost vdpa device unbind iommufd failed: %d, devid: %d\n", + ret, vdev->iommufd_devid); + } +} + +static int vhost_vdpa_device_bind_iommufd(VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = vdev->iommufd; + struct vdpa_dev_bind_iommufd bind = { + .iommufd = iommufd->fd, + .out_devid = -1, + }; + int ret; + + /* iommufd auto unbind when vdev->vhostfd close */ + ret = ioctl(vdev->vhostfd, VHOST_VDPA_BIND_IOMMUFD, &bind); + if (ret) { + qemu_log("vhost vdpa device bind iommufd failed: %d\n", ret); + return ret; + } + vdev->iommufd_devid = bind.out_devid; + return 0; +} + +static int vhost_vdpa_container_attach_device(VDPAIOMMUFDContainer *container, VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = NULL; + IOMMUFDHWPT *hwpt = NULL; + Error *err = NULL; + uint32_t pt_id; + int ret; + + if (!container || !container->iommufd || container->iommufd != vdev->iommufd) { + return -1; + } + + iommufd = container->iommufd; + + /* try to find an available hwpt */ + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + pt_id = hwpt->hwpt_id; + ret = ioctl(vdev->vhostfd, VHOST_VDPA_ATTACH_IOMMUFD_PT, &pt_id); + if (ret == 0) { + QLIST_INSERT_HEAD(&hwpt->device_list, vdev, next); + vdev->dev.has_container = true; + return 0; + } + } + + /* available hwpt not found in the container, create a new one */ + hwpt = g_new0(IOMMUFDHWPT, 1); + QLIST_INIT(&hwpt->device_list); + + if (!iommufd_backend_alloc_hwpt(iommufd, vdev->iommufd_devid, + container->ioas_id, 0, 0, 0, NULL, + &pt_id, NULL, &err)) { + error_report_err(err); + ret = -1; + goto free_mem; + } + + hwpt->hwpt_id = pt_id; + + ret = ioctl(vdev->vhostfd, VHOST_VDPA_ATTACH_IOMMUFD_PT, &pt_id); + if (ret) { + qemu_log("vhost vdpa device attach iommufd pt failed: %d\n", ret); + goto free_hwpt; + } + + QLIST_INSERT_HEAD(&hwpt->device_list, vdev, next); + vdev->dev.has_container = true; + QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next); + + return 0; + +free_hwpt: + iommufd_backend_free_id(iommufd, hwpt->hwpt_id); +free_mem: + g_free(hwpt); + return ret; +} + +static void vhost_vdpa_container_detach_device(VDPAIOMMUFDContainer *container, VhostVdpaDevice *vdev) +{ + IOMMUFDBackend *iommufd = vdev->iommufd; + IOMMUFDHWPT *hwpt = NULL; + + /* find the hwpt using by this device */ + hwpt = vhost_vdpa_find_hwpt(container, vdev); + if (!hwpt) { + return; + } + + ioctl(vdev->vhostfd, VHOST_VDPA_DETACH_IOMMUFD_PT, &hwpt->hwpt_id); + + QLIST_SAFE_REMOVE(vdev, next); + vdev->dev.has_container = false; + + /* No device using this hwpt, free it */ + if (QLIST_EMPTY(&hwpt->device_list)) { + iommufd_backend_free_id(iommufd, hwpt->hwpt_id); + 
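/* + * HWPT lifetime follows its device_list: attach reuses any HWPT the + * kernel accepts the device into, and detaching the last device frees + * the HWPT id (above) and then drops the list entry. + */ +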
QLIST_SAFE_REMOVE(hwpt, next); + g_free(hwpt); + } +} + +static int vhost_vdpa_container_get_dev_count(VDPAIOMMUFDContainer *container) +{ + IOMMUFDHWPT *hwpt; + VhostVdpaDevice *dev; + int dev_count = 0; + + QLIST_FOREACH(hwpt, &container->hwpt_list, next) { + QLIST_FOREACH(dev, &hwpt->device_list, next) { + dev_count++; + } + } + + return dev_count; +} + +int vhost_vdpa_attach_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + IOMMUFDBackend *iommufd = vdev->iommufd; + bool new_container = false; + int dev_count = 0; + int ret = 0; + + if (!iommufd) { + return 0; + } + + container = vhost_vdpa_find_container(vdev); + if (!container) { + container = vhost_vdpa_create_container(vdev); + if (!container) { + qemu_log("vdpa create container failed\n"); + return -1; + } + ret = vhost_vdpa_container_connect_iommufd(container); + if (ret) { + qemu_log("vdpa container connect iommufd failed\n"); + goto destroy; + } + new_container = true; + } + + ret = vhost_vdpa_device_bind_iommufd(vdev); + if (ret) { + qemu_log("vdpa device bind iommufd failed\n"); + goto disconnect; + } + + ret = vhost_vdpa_container_attach_device(container, vdev); + if (ret) { + qemu_log("vdpa container attach device failed\n"); + goto unbind; + } + + /* register the container memory listener when attaching the first device */ + dev_count = vhost_vdpa_container_get_dev_count(container); + if (dev_count == 1) { + memory_listener_register(&container->listener, &address_space_memory); + } + + return 0; + +unbind: + vhost_vdpa_device_unbind_iommufd(vdev); +disconnect: + if (!new_container) { + return ret; + } + vhost_vdpa_container_disconnect_iommufd(container); +destroy: + vhost_vdpa_destroy_container(container); + + return ret; +} + +void vhost_vdpa_detach_container(VhostVdpaDevice *vdev) +{ + VDPAIOMMUFDContainer *container = NULL; + IOMMUFDBackend *iommufd = vdev->iommufd; + + if (!iommufd) { + return; + } + + container = vhost_vdpa_find_container(vdev); + if (!container) { + return; + } + + vhost_vdpa_container_detach_device(container, vdev); + + vhost_vdpa_device_unbind_iommufd(vdev); + + if (!QLIST_EMPTY(&container->hwpt_list)) { + return; + } + /* No HWPT in this container, destroy it */ + memory_listener_unregister(&container->listener); + vhost_vdpa_container_disconnect_iommufd(container); + + vhost_vdpa_destroy_container(container); +} diff --git a/hw/virtio/vdpa-dev-mig.c b/hw/virtio/vdpa-dev-mig.c new file mode 100644 index 0000000000000000000000000000000000000000..7de996c83552efe1ccc7edcfd82c584c6ec3e1db --- /dev/null +++ b/hw/virtio/vdpa-dev-mig.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see . 
+ */ + +#include +#include +#include "qemu/osdep.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vdpa-dev.h" +#include "hw/virtio/virtio-bus.h" +#include "migration/register.h" +#include "migration/migration.h" +#include "migration/misc.h" +#include "qemu/error-report.h" +#include "hw/virtio/vdpa-dev-mig.h" +#include "migration/qemu-file-types.h" +#include "qemu/main-loop.h" + +/* + * Flags used as delimiter: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + */ +#define VDPA_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VDPA_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VDPA_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) + +static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request, + void *arg) +{ + struct vhost_vdpa *v = dev->opaque; + int fd = v->device_fd; + + if (dev->vhost_ops->backend_type != VHOST_BACKEND_TYPE_VDPA) { + error_report("backend type isn't VDPA. Operation not permitted!"); + return -EPERM; + } + + return ioctl(fd, request, arg); +} + +static int vhost_vdpa_set_mig_state(struct vhost_dev *dev, uint8_t state) +{ + return vhost_vdpa_call(dev, VHOST_VDPA_SET_MIG_STATE, &state); +} + +static int vhost_vdpa_dev_buffer_size(struct vhost_dev *dev, uint32_t *size) +{ + return vhost_vdpa_call(dev, VHOST_GET_DEV_BUFFER_SIZE, size); +} + +static int vhost_vdpa_dev_buffer_save(struct vhost_dev *dev, QEMUFile *f) +{ + struct vhost_vdpa_config *config; + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); + uint32_t buffer_size = 0; + int ret; + + ret = vhost_vdpa_dev_buffer_size(dev, &buffer_size); + if (ret) { + error_report("get dev buffer size failed: %d", ret); + return ret; + } + + qemu_put_be32(f, buffer_size); + + config = g_malloc(buffer_size + config_size); + config->off = 0; + config->len = buffer_size; + + ret = vhost_vdpa_call(dev, VHOST_GET_DEV_BUFFER, config); + if (ret) { + error_report("get dev buffer failed: %d", ret); + goto free; + } + + qemu_put_buffer(f, config->buf, buffer_size); +free: + g_free(config); + + return ret; +} + +static int vhost_vdpa_dev_buffer_load(struct vhost_dev *dev, QEMUFile *f) +{ + struct vhost_vdpa_config *config; + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); + uint32_t buffer_size, recv_size; + int ret; + + buffer_size = qemu_get_be32(f); + + config = g_malloc(buffer_size + config_size); + config->off = 0; + config->len = buffer_size; + + recv_size = qemu_get_buffer(f, config->buf, buffer_size); + if (recv_size != buffer_size) { + error_report("read dev mig buffer failed, buffer_size: %u, " + "recv_size: %u", buffer_size, recv_size); + ret = -EINVAL; + goto free; + } + + ret = vhost_vdpa_call(dev, VHOST_SET_DEV_BUFFER, config); + if (ret) { + error_report("set dev buffer failed: %d", ret); + } + +free: + g_free(config); + + return ret; +} + +static int vhost_vdpa_device_suspend(VhostVdpaDevice *vdpa) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); + + if (!vdpa->started || vdpa->suspended) { + return 0; + } + + vdpa->suspended = true; + + return vhost_dev_suspend(&vdpa->dev, vdev, false); +} + +static int vhost_vdpa_device_resume(VhostVdpaDevice *vdpa) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vdpa); + MigrationIncomingState *mis = migration_incoming_get_current(); + int ret; + + if (!vdpa->started || + (!vdpa->suspended && mis->state != RUN_STATE_RESTORE_VM)) { + return 0; + } + + ret = vhost_dev_resume(&vdpa->dev, vdev, false); + if (ret < 0) { + return ret; + } + +
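/* + * Clear the suspended flag only once the backend has acknowledged the + * resume; vhost_vdpa_device_set_status() short-circuits while + * suspended is set, so this is what re-enables normal start/stop + * handling. + */ +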
vdpa->suspended = false; + return ret; +} + +static void vdpa_dev_migration_handle_incoming_bh(void *opaque) +{ + struct vhost_dev *hdev = opaque; + int ret; + + /* Post-start the device; rollback is not supported if this fails. */ + ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_POST_START); + if (ret) { + error_report("Failed to set state: POST_START"); + } +} + +static void vdpa_dev_vmstate_change(void *opaque, bool running, RunState state) +{ + VhostVdpaDevice *vdpa = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdpa->dev; + int ret; + MigrationState *ms = migrate_get_current(); + MigrationIncomingState *mis = migration_incoming_get_current(); + + if (!running) { + if (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED) { + ret = vhost_vdpa_device_suspend(vdpa); + if (ret) { + error_report("suspend vdpa device failed: %d", ret); + if (ms->migration_thread_running) { + migrate_fd_cancel(ms); + } + } + } + } else { + if (vdpa->suspended) { + ret = vhost_vdpa_device_resume(vdpa); + if (ret) { + error_report("vhost vdpa device resume failed: %d", ret); + } + } + + if (mis->state == RUN_STATE_RESTORE_VM) { + ret = vhost_vdpa_call(hdev, VHOST_VDPA_RESUME, NULL); + if (ret) { + error_report("migration dest resume device failed: %d", ret); + exit(EXIT_FAILURE); + } + /* post resume */ + mis->bh = qemu_bh_new(vdpa_dev_migration_handle_incoming_bh, + hdev); + qemu_bh_schedule(mis->bh); + } + } +} + +static int vdpa_save_setup(QEMUFile *f, void *opaque) +{ + qemu_put_be64(f, VDPA_MIG_FLAG_DEV_SETUP_STATE); + qemu_put_be64(f, VDPA_MIG_FLAG_END_OF_STATE); + + return qemu_file_get_error(f); +} + +static int vdpa_save_complete_precopy(QEMUFile *f, void *opaque) +{ + VhostVdpaDevice *vdev = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdev->dev; + int ret; + + qemu_put_be64(f, VDPA_MIG_FLAG_DEV_CONFIG_STATE); + qemu_put_be16(f, (uint16_t)vdev->suspended); + if (vdev->suspended) { + ret = vhost_vdpa_dev_buffer_save(hdev, f); + if (ret) { + error_report("Save vdpa device buffer failed: %d", ret); + return ret; + } + } + qemu_put_be64(f, VDPA_MIG_FLAG_END_OF_STATE); + + return qemu_file_get_error(f); +} + +static int vdpa_load_state(QEMUFile *f, void *opaque, int version_id) +{ + VhostVdpaDevice *vdev = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &vdev->dev; + + int ret; + uint64_t data; + uint16_t suspended; + + data = qemu_get_be64(f); + while (data != VDPA_MIG_FLAG_END_OF_STATE) { + if (data == VDPA_MIG_FLAG_DEV_SETUP_STATE) { + data = qemu_get_be64(f); + if (data == VDPA_MIG_FLAG_END_OF_STATE) { + return 0; + } else { + error_report("SETUP STATE: EOS not found 0x%" PRIx64, data); + return -EINVAL; + } + } else if (data == VDPA_MIG_FLAG_DEV_CONFIG_STATE) { + suspended = qemu_get_be16(f); + if (suspended) { + ret = vhost_vdpa_dev_buffer_load(hdev, f); + if (ret) { + error_report("failed to restore device buffer"); + return ret; + } + } + } + + ret = qemu_file_get_error(f); + if (ret) { + error_report("qemu file error: %d", ret); + return ret; + } + data = qemu_get_be64(f); + } + + return 0; +} + +static int vdpa_load_setup(QEMUFile *f, void *opaque) +{ + VhostVdpaDevice *v = VHOST_VDPA_DEVICE(opaque); + struct vhost_dev *hdev = &v->dev; + int ret = 0; + + ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_PRE_START); + if (ret) { + error_report("pre start device failed: %d", ret); + goto out; + } + + return qemu_file_get_error(f); +out: + return ret; +} + +static SaveVMHandlers savevm_vdpa_handlers = { + .save_setup = vdpa_save_setup, + .save_live_complete_precopy
+    .load_state = vdpa_load_state,
+    .load_setup = vdpa_load_setup,
+};
+
+static void vdpa_migration_state_notifier(Notifier *notifier, void *data)
+{
+    MigrationState *s = data;
+    VhostVdpaDevice *vdev = container_of(notifier,
+                                         VhostVdpaDevice,
+                                         migration_state);
+    struct vhost_dev *hdev = &vdev->dev;
+    int ret;
+
+    switch (s->state) {
+    case MIGRATION_STATUS_CANCELLING:
+    case MIGRATION_STATUS_CANCELLED:
+    case MIGRATION_STATUS_FAILED:
+        ret = vhost_vdpa_set_mig_state(hdev, VDPA_DEVICE_CANCEL);
+        if (ret) {
+            error_report("Failed to set state: CANCEL");
+        }
+
+        break;
+    case MIGRATION_STATUS_COMPLETED:
+    default:
+        break;
+    }
+}
+
+void vdpa_migration_register(VhostVdpaDevice *vdev)
+{
+    vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev),
+                                                     vdpa_dev_vmstate_change,
+                                                     DEVICE(vdev));
+    register_savevm_live("vdpa", -1, 1,
+                         &savevm_vdpa_handlers, DEVICE(vdev));
+    migration_add_notifier(&vdev->migration_state,
+                           vdpa_migration_state_notifier);
+}
+
+void vdpa_migration_unregister(VhostVdpaDevice *vdev)
+{
+    migration_remove_notifier(&vdev->migration_state);
+    unregister_savevm(NULL, "vdpa", DEVICE(vdev));
+    qemu_del_vm_change_state_handler(vdev->vmstate);
+}
diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c
index f22d5d5bc0ac66e8997037d84c732271d45e0ba0..7ce8547419a4c807df89b4a58dd10c5217754ddc 100644
--- a/hw/virtio/vdpa-dev.c
+++ b/hw/virtio/vdpa-dev.c
@@ -28,6 +28,11 @@
 #include "hw/virtio/vdpa-dev.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/runstate.h"
+#include "hw/virtio/vdpa-dev-mig.h"
+#include "migration/migration.h"
+#include "exec/address-spaces.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "hw/virtio/vdpa-dev-iommufd.h"
 
 static void vhost_vdpa_device_dummy_handle_output(VirtIODevice *vdev,
                                                   VirtQueue *vq)
@@ -106,6 +111,7 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
     v->dev.vq_index = 0;
     v->dev.vq_index_end = v->dev.nvqs;
     v->dev.backend_features = 0;
+    v->dev.has_container = false;
     v->started = false;
 
     ret = vhost_vdpa_get_iova_range(v->vhostfd, &iova_range);
@@ -123,6 +129,17 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
         goto free_vqs;
     }
 
+    /* If the vdpa device is associated with an iommufd, attach it to the
+     * container instead of registering a memory listener. */
+    if (v->iommufd) {
+        ret = vhost_vdpa_attach_container(v);
+        if (ret < 0) {
+            error_setg(errp, "vhost vdpa device attach container failed: %s",
+                       strerror(-ret));
+            goto free_vqs;
+        }
+    } else {
+        memory_listener_register(&v->vdpa.listener, &address_space_memory);
+    }
     v->config_size = vhost_vdpa_device_get_u32(v->vhostfd,
                                                VHOST_VDPA_GET_CONFIG_SIZE,
                                                errp);
@@ -154,12 +171,18 @@ static void vhost_vdpa_device_realize(DeviceState *dev, Error **errp)
                                          vhost_vdpa_device_dummy_handle_output);
     }
 
+    vdpa_migration_register(v);
+
     return;
 
 free_config:
     g_free(v->config);
 vhost_cleanup:
+    memory_listener_unregister(&v->vdpa.listener);
     vhost_dev_cleanup(&v->dev);
+    if (v->iommufd) {
+        vhost_vdpa_detach_container(v);
+    }
 free_vqs:
     g_free(vqs);
 out:
@@ -173,6 +196,7 @@ static void vhost_vdpa_device_unrealize(DeviceState *dev)
     VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev);
     int i;
 
+    vdpa_migration_unregister(s);
     virtio_set_status(vdev, 0);
 
     for (i = 0; i < s->num_queues; i++) {
@@ -183,7 +207,11 @@ static void vhost_vdpa_device_unrealize(DeviceState *dev)
     g_free(s->config);
     g_free(s->dev.vqs);
+    memory_listener_unregister(&s->vdpa.listener);
     vhost_dev_cleanup(&s->dev);
+    if (s->iommufd) {
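+        /*
+         * Mirror of the realize() path: an iommufd-backed device was
+         * attached to a container instead of registering a memory
+         * listener, so detach it again on unrealize.
+         */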
vhost_vdpa_detach_container(s); + } qemu_close(s->vhostfd); s->vhostfd = -1; } @@ -192,7 +220,23 @@ static void vhost_vdpa_device_get_config(VirtIODevice *vdev, uint8_t *config) { VhostVdpaDevice *s = VHOST_VDPA_DEVICE(vdev); + uint8_t *new_config; + int ret; + if (s->vdev_id != VIRTIO_ID_BLOCK) { + goto out; + } + + new_config = g_malloc0(s->config_size); + ret = vhost_dev_get_config(&s->dev, new_config, s->config_size, NULL); + if (ret < 0) { + error_report("vhost-vdpa-device: get config failed(%d)\n", ret); + goto free; + } + memcpy(s->config, new_config, s->config_size); +free: + g_free(new_config); +out: memcpy(config, s->config, s->config_size); } @@ -250,14 +294,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, Error **errp) s->dev.acked_features = vdev->guest_features; - ret = vhost_dev_start(&s->dev, vdev, false); + ret = vhost_dev_start(&s->dev, vdev, true); if (ret < 0) { error_setg_errno(errp, -ret, "Error starting vhost"); goto err_guest_notifiers; } - for (i = 0; i < s->dev.nvqs; ++i) { - vhost_vdpa_set_vring_ready(&s->vdpa, i); - } s->started = true; /* @@ -316,7 +357,7 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) should_start = false; } - if (s->started == should_start) { + if (s->started == should_start || s->suspended) { return; } @@ -333,12 +374,13 @@ static void vhost_vdpa_device_set_status(VirtIODevice *vdev, uint8_t status) static Property vhost_vdpa_device_properties[] = { DEFINE_PROP_STRING("vhostdev", VhostVdpaDevice, vhostdev), DEFINE_PROP_UINT16("queue-size", VhostVdpaDevice, queue_size, 0), + DEFINE_PROP_LINK("iommufd", VhostVdpaDevice, iommufd, TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), DEFINE_PROP_END_OF_LIST(), }; static const VMStateDescription vmstate_vhost_vdpa_device = { .name = "vhost-vdpa-device", - .unmigratable = 1, + .unmigratable = 0, .minimum_version_id = 1, .version_id = 1, .fields = (VMStateField[]) { diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index f214df804b27fb29effee67a58e9437f303ff12b..224e7d2379cda51f7a4657dcb64685a1d9306c55 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -28,6 +28,7 @@ #include "sysemu/cryptodev.h" #include "migration/migration.h" #include "migration/postcopy-ram.h" +#include "migration/register.h" #include "trace.h" #include "exec/ramblock.h" @@ -2119,6 +2120,28 @@ static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier, return 0; } +static int vhost_user_load_setup(QEMUFile *f, void *opaque) +{ + struct vhost_dev *hdev = opaque; + int r; + + if (hdev->vhost_ops && hdev->vhost_ops->vhost_set_mem_table) { + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); + if (r < 0) { + qemu_log("error: vhost_set_mem_table failed: %s(%d)\n", + strerror(errno), errno); + return r; + } else { + qemu_log("info: vhost_set_mem_table OK\n"); + } + } + return 0; +} + +SaveVMHandlers savevm_vhost_user_handlers = { + .load_setup = vhost_user_load_setup, +}; + static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, Error **errp) { @@ -2126,9 +2149,15 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, struct vhost_user *u; VhostUserState *vus = (VhostUserState *) opaque; int err; + Chardev *chr; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + chr = qemu_chr_fe_get_driver(((VhostUserState *)opaque)->chr); + if (chr) { + chr->chr_for_flag = CHR_FOR_VHOST_USER; + } + u = g_new0(struct vhost_user, 1); u->user = vus; u->dev = dev; @@ -2255,6 +2284,7 @@ static int 
vhost_user_backend_init(struct vhost_dev *dev, void *opaque, u->postcopy_notifier.notify = vhost_user_postcopy_notifier; postcopy_add_notifier(&u->postcopy_notifier); + register_savevm_live("vhost-user", -1, 1, &savevm_vhost_user_handlers, dev); return 0; } @@ -2286,6 +2316,7 @@ static int vhost_user_backend_cleanup(struct vhost_dev *dev) u->region_rb_len = 0; g_free(u); dev->opaque = 0; + unregister_savevm(NULL, "vhost-user", dev); return 0; } diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 819b2d811af4247e52f7fd9f109e844218836d3c..b5fb89b98e15fe871d16b237390aa0ed581022cc 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -26,13 +26,14 @@ #include "qemu/main-loop.h" #include "trace.h" #include "qapi/error.h" +#include "hw/virtio/vdpa-dev-iommufd.h" /* * Return one past the end of the end of section. Be careful with uint64_t * conversions! */ -static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, - int page_mask) +Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, + int page_mask) { Int128 llend = int128_make64(section->offset_within_address_space); llend = int128_add(llend, section->size); @@ -41,10 +42,10 @@ static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, return llend; } -static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, - uint64_t iova_min, - uint64_t iova_max, - int page_mask) +bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, + uint64_t iova_min, + uint64_t iova_max, + int page_mask) { Int128 llend; @@ -596,21 +597,6 @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) return 0; } - /* - * If dev->shadow_vqs_enabled at initialization that means the device has - * been started with x-svq=on, so don't block migration - */ - if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) { - /* We don't have dev->features yet */ - uint64_t features; - ret = vhost_vdpa_get_dev_features(dev, &features); - if (unlikely(ret)) { - error_setg_errno(errp, -ret, "Could not get device features"); - return ret; - } - vhost_svq_valid_features(features, &dev->migration_blocker); - } - /* * Similar to VFIO, we end up pinning all guest memory and have to * disable discarding of RAM. 
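The vdpa and vhost-user hunks above both tie a named SaveVMHandlers registration to backend lifetime: it is registered in backend init and removed in backend cleanup, so a torn-down backend can never be called back by the migration core. A minimal sketch of that pairing, assuming QEMU's migration/register.h API as used above; the "my-backend" name and the trivial handler are illustrative, not part of the patch:

    /* Illustrative only: pair register_savevm_live() with
     * unregister_savevm() over the backend's lifetime. */
    static int my_backend_load_setup(QEMUFile *f, void *opaque)
    {
        return 0; /* reprogram backend state before the stream is parsed */
    }

    static SaveVMHandlers savevm_my_backend_handlers = {
        .load_setup = my_backend_load_setup,
    };

    static int my_backend_init(struct vhost_dev *dev)
    {
        /* instance_id -1 asks the core to allocate one; version_id 1 */
        register_savevm_live("my-backend", -1, 1,
                             &savevm_my_backend_handlers, dev);
        return 0;
    }

    static int my_backend_cleanup(struct vhost_dev *dev)
    {
        unregister_savevm(NULL, "my-backend", dev);
        return 0;
    }

vhost-user only needs .load_setup, which replays the memory table to the backend before the incoming stream is parsed; a richer backend such as the vdpa device also hooks .save_setup, .save_live_complete_precopy and .load_state.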
@@ -829,10 +815,11 @@ static int vhost_vdpa_set_features(struct vhost_dev *dev, static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) { uint64_t features; - uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | - 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH | - 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID | - 0x1ULL << VHOST_BACKEND_F_SUSPEND; + uint64_t f = BIT_ULL(VHOST_BACKEND_F_IOTLB_MSG_V2) | + BIT_ULL(VHOST_BACKEND_F_IOTLB_BATCH) | + BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID) | + BIT_ULL(VHOST_BACKEND_F_SUSPEND) | + BIT_ULL(VHOST_BACKEND_F_BYTEMAPLOG); int r; if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { @@ -864,13 +851,11 @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, static int vhost_vdpa_reset_device(struct vhost_dev *dev) { - struct vhost_vdpa *v = dev->opaque; int ret; uint8_t status = 0; ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); trace_vhost_vdpa_reset_device(dev); - v->suspended = false; return ret; } @@ -882,19 +867,46 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) return idx; } -int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx, + int enable) { struct vhost_dev *dev = v->dev; struct vhost_vring_state state = { .index = idx, - .num = 1, + .num = enable, }; + hwaddr addr = virtio_queue_get_desc_addr(dev->vdev, idx); + if (addr == 0) { + return 0; + } + int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state); - trace_vhost_vdpa_set_vring_ready(dev, idx, r); + trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r); return r; } +static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable) +{ + struct vhost_vdpa *v = dev->opaque; + unsigned int i; + int ret; + + for (i = 0; i < dev->nvqs; ++i) { + ret = vhost_vdpa_set_vring_enable_one(v, i, enable); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +{ + return vhost_vdpa_set_vring_enable_one(v, idx, 1); +} + static int vhost_vdpa_set_config_call(struct vhost_dev *dev, int fd) { @@ -1268,29 +1280,6 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev) } } -static void vhost_vdpa_suspend(struct vhost_dev *dev) -{ - struct vhost_vdpa *v = dev->opaque; - int r; - - if (!vhost_vdpa_first_dev(dev)) { - return; - } - - if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) { - trace_vhost_vdpa_suspend(dev); - r = ioctl(v->device_fd, VHOST_VDPA_SUSPEND); - if (unlikely(r)) { - error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno); - } else { - v->suspended = true; - return; - } - } - - vhost_vdpa_reset_device(dev); -} - static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) { struct vhost_vdpa *v = dev->opaque; @@ -1304,7 +1293,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) return -1; } } else { - vhost_vdpa_suspend(dev); vhost_vdpa_svqs_stop(dev); vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); } @@ -1319,8 +1307,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) "IOMMU and try again"); return -1; } - memory_listener_register(&v->listener, dev->vdev->dma_as); - return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); } @@ -1329,8 +1315,6 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) static void vhost_vdpa_reset_status(struct vhost_dev *dev) { - struct vhost_vdpa *v = dev->opaque; - if (dev->vq_index + dev->nvqs != dev->vq_index_end) { return; } @@ -1338,7 +1322,6 @@ 
static void vhost_vdpa_reset_status(struct vhost_dev *dev) vhost_vdpa_reset_device(dev); vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER); - memory_listener_unregister(&v->listener); } static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, @@ -1354,6 +1337,30 @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base); } +static int vhost_vdpa_set_log_fd(struct vhost_dev *dev, int fd, + struct vhost_log *log) +{ + struct vhost_vdpa *v = dev->opaque; + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_SET_LOG_FD, &fd); +} + +static int vhost_vdpa_set_log_size(struct vhost_dev *dev, uint64_t size, + struct vhost_log *log) +{ + struct vhost_vdpa *v = dev->opaque; + uint64_t logsize = size * sizeof(*(log->log)); + + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_SET_LOG_SIZE, &logsize); +} + static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, struct vhost_vring_addr *addr) { @@ -1404,14 +1411,6 @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, return 0; } - if (!v->suspended) { - /* - * Cannot trust in value returned by device, let vhost recover used - * idx from guest. - */ - return -1; - } - ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); return ret; @@ -1488,11 +1487,59 @@ static bool vhost_vdpa_force_iommu(struct vhost_dev *dev) return true; } +static int vhost_vdpa_suspend_device(struct vhost_dev *dev) +{ + int ret; + + vhost_vdpa_svqs_stop(dev); + vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); + + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return 0; + } + + ret = vhost_vdpa_call(dev, VHOST_VDPA_SUSPEND, NULL); + return ret; +} + +static int vhost_vdpa_resume_device(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + bool ok; + + vhost_vdpa_host_notifiers_init(dev); + ok = vhost_vdpa_svqs_start(dev); + if (unlikely(!ok)) { + return -1; + } + for (int i = 0; i < v->dev->nvqs; ++i) { + vhost_vdpa_set_vring_ready(v, v->dev->vq_index + i); + } + + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_VDPA_RESUME, NULL); +} + +static int vhost_vdpa_log_sync(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + return vhost_vdpa_call(dev, VHOST_LOG_SYNC, NULL); +} + const VhostOps vdpa_ops = { .backend_type = VHOST_BACKEND_TYPE_VDPA, .vhost_backend_init = vhost_vdpa_init, .vhost_backend_cleanup = vhost_vdpa_cleanup, .vhost_set_log_base = vhost_vdpa_set_log_base, + .vhost_set_log_size = vhost_vdpa_set_log_size, + .vhost_set_log_fd = vhost_vdpa_set_log_fd, .vhost_set_vring_addr = vhost_vdpa_set_vring_addr, .vhost_set_vring_num = vhost_vdpa_set_vring_num, .vhost_set_vring_base = vhost_vdpa_set_vring_base, @@ -1508,6 +1555,7 @@ const VhostOps vdpa_ops = { .vhost_set_features = vhost_vdpa_set_features, .vhost_reset_device = vhost_vdpa_reset_device, .vhost_get_vq_index = vhost_vdpa_get_vq_index, + .vhost_set_vring_enable = vhost_vdpa_set_vring_enable, .vhost_get_config = vhost_vdpa_get_config, .vhost_set_config = vhost_vdpa_set_config, .vhost_requires_shm_log = NULL, @@ -1519,6 +1567,9 @@ const VhostOps vdpa_ops = { .vhost_get_device_id = vhost_vdpa_get_device_id, .vhost_vq_get_addr = 
vhost_vdpa_vq_get_addr,
     .vhost_force_iommu = vhost_vdpa_force_iommu,
+    .vhost_log_sync = vhost_vdpa_log_sync,
     .vhost_set_config_call = vhost_vdpa_set_config_call,
     .vhost_reset_status = vhost_vdpa_reset_status,
+    .vhost_dev_suspend = vhost_vdpa_suspend_device,
+    .vhost_dev_resume = vhost_vdpa_resume_device,
 };
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 2c9ac794680ea9b65eba6cc22e70cf141e90aa73..58207e472b082c3a2c10a139de86a6445239abba 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -26,8 +26,11 @@
 #include "hw/mem/memory-device.h"
 #include "migration/blocker.h"
 #include "migration/qemu-file-types.h"
+#include "migration/migration.h"
 #include "sysemu/dma.h"
 #include "trace.h"
+#include "qapi/qapi-commands-migration.h"
+#include "sysemu/kvm.h"
 
 /* enabled until disconnected backend stabilizes */
 #define _VHOST_DEBUG 1
@@ -43,6 +46,11 @@
 do { } while (0)
 #endif
 
+bool vhost_bytemap_log_support(struct vhost_dev *dev)
+{
+    return (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_BYTEMAPLOG));
+}
+
 static struct vhost_log *vhost_log;
 static struct vhost_log *vhost_log_shm;
 
@@ -55,6 +63,8 @@ static unsigned int used_shared_memslots;
 static QLIST_HEAD(, vhost_dev) vhost_devices =
     QLIST_HEAD_INITIALIZER(vhost_devices);
 
+bool used_memslots_exceeded;
+
 unsigned int vhost_get_max_memslots(void)
 {
     unsigned int max = UINT_MAX;
@@ -149,10 +159,10 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
     }
 }
 
-static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
-                                   MemoryRegionSection *section,
-                                   hwaddr first,
-                                   hwaddr last)
+int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
+                            MemoryRegionSection *section,
+                            hwaddr first,
+                            hwaddr last)
 {
     int i;
     hwaddr start_addr;
@@ -229,12 +239,40 @@ static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
     return 0;
 }
 
+int vhost_sync_dirty_bytemap(struct vhost_dev *dev,
+                             MemoryRegionSection *section)
+{
+    unsigned long *bytemap = dev->log->log;
+    return memory_section_set_dirty_bytemap(section, bytemap);
+}
+
 static void vhost_log_sync(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
                                          memory_listener);
-    vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
+    MigrationState *ms = migrate_get_current();
+
+    if (!dev->log_enabled || !dev->log || dev->has_container) {
+        return;
+    }
+
+    if (dev->vhost_ops->vhost_log_sync) {
+        int r = dev->vhost_ops->vhost_log_sync(dev);
+        if (r < 0) {
+            error_report("Failed to sync dirty log: %d", r);
+            if (migration_is_running(ms->state)) {
+                qmp_migrate_cancel(NULL);
+            }
+            return;
+        }
+    }
+
+    if (vhost_bytemap_log_support(dev)) {
+        vhost_sync_dirty_bytemap(dev, section);
+    } else {
+        vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
+    }
 }
 
 static void vhost_log_sync_range(struct vhost_dev *dev,
@@ -244,7 +282,11 @@ static void vhost_log_sync_range(struct vhost_dev *dev,
     /* FIXME: this is N^2 in number of sections */
     for (i = 0; i < dev->n_mem_sections; ++i) {
         MemoryRegionSection *section = &dev->mem_sections[i];
-        vhost_sync_dirty_bitmap(dev, section, first, last);
+        if (vhost_bytemap_log_support(dev)) {
+            vhost_sync_dirty_bytemap(dev, section);
+        } else {
+            vhost_sync_dirty_bitmap(dev, section, first, last);
+        }
     }
 }
 
@@ -252,11 +294,19 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)
 {
     uint64_t log_size = 0;
     int i;
+    uint64_t vhost_log_chunk_size;
+
+    if (vhost_bytemap_log_support(dev)) {
+        vhost_log_chunk_size = VHOST_LOG_CHUNK_BYTES;
+    } else {
+        vhost_log_chunk_size = VHOST_LOG_CHUNK;
+    }
+
     for (i = 0; i < dev->mem->nregions; ++i) {
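+        /*
+         * The divisor below is why the two log formats differ in size:
+         * a bytemap spends one byte per tracked page where the bitmap
+         * spends one bit, so VHOST_LOG_CHUNK_BYTES presumably covers 8x
+         * fewer pages per chunk than VHOST_LOG_CHUNK and the log grows
+         * accordingly.
+         */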
        struct vhost_memory_region *reg = dev->mem->regions + i;
         uint64_t last = range_get_last(reg->guest_phys_addr, reg->memory_size);
-        log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
+        log_size = MAX(log_size, last / vhost_log_chunk_size + 1);
     }
     return log_size;
 }
@@ -374,12 +424,21 @@ static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
            dev->vhost_ops->vhost_requires_shm_log(dev);
 }
 
-static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
+static inline int vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
 {
     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
-    uint64_t log_base = (uintptr_t)log->log;
+    uint64_t log_base;
     int r;
 
+    if (!log) {
+        r = -ENOMEM;
+        goto out;
+    }
+
+    log_base = (uintptr_t)log->log;
+
     /* inform backend of log switching, this must be done before
        releasing the current log, to ensure no logging is lost */
     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
@@ -387,9 +446,19 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
         VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
     }
 
+    if (dev->vhost_ops->vhost_set_log_size) {
+        r = dev->vhost_ops->vhost_set_log_size(dev, size, dev->log);
+        if (r < 0) {
+            VHOST_OPS_DEBUG(r, "vhost_set_log_size failed");
+        }
+    }
+
     vhost_log_put(dev, true);
     dev->log = log;
     dev->log_size = size;
+
+out:
+    return r;
 }
 
 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
@@ -1015,7 +1084,11 @@ static int vhost_migration_log(MemoryListener *listener, bool enable)
         }
         vhost_log_put(dev, false);
     } else {
-        vhost_dev_log_resize(dev, vhost_get_log_size(dev));
+        r = vhost_dev_log_resize(dev, vhost_get_log_size(dev));
+        if (r < 0) {
+            return r;
+        }
+
         r = vhost_dev_set_log(dev, true);
         if (r < 0) {
             goto check_dev_state;
@@ -1047,20 +1120,24 @@ check_dev_state:
 
 static void vhost_log_global_start(MemoryListener *listener)
 {
     int r;
+    Error *errp = NULL;
 
     r = vhost_migration_log(listener, true);
     if (r < 0) {
-        abort();
+        error_setg(&errp, "Failed to start vhost migration log");
+        migrate_fd_error(migrate_get_current(), errp);
     }
 }
 
 static void vhost_log_global_stop(MemoryListener *listener)
 {
     int r;
+    Error *errp = NULL;
 
     r = vhost_migration_log(listener, false);
     if (r < 0) {
-        abort();
+        error_setg(&errp, "Failed to stop vhost migration log");
+        migrate_fd_error(migrate_get_current(), errp);
     }
 }
 
@@ -1540,7 +1617,12 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
     hdev->log_size = 0;
     hdev->log_enabled = false;
     hdev->started = false;
-    memory_listener_register(&hdev->memory_listener, &address_space_memory);
+    if (virtcca_cvm_enabled() && virtcca_shared_hugepage &&
+        virtcca_shared_hugepage->ram_block) {
+        memory_listener_register(&hdev->memory_listener,
+                                 &address_space_virtcca_shared_memory);
+    } else {
+        memory_listener_register(&hdev->memory_listener, &address_space_memory);
+    }
     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
 
     /*
@@ -1564,8 +1646,11 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
         error_setg(errp, "vhost backend memory slots limit (%d) is less"
                    " than current number of used (%d) and reserved (%d)"
                    " memory slots for memory devices.", limit, used, reserved);
+        used_memslots_exceeded = true;
         r = -EINVAL;
         goto fail_busyloop;
+    } else {
+        used_memslots_exceeded = false;
     }
 
     return 0;
@@ -1984,7 +2069,13 @@ static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
     return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
 }
 
-/* Host notifiers must be enabled at this point.
*/ +/* + * Host notifiers must be enabled at this point. + * + * If @vrings is true, this function will enable all vrings before starting the + * device. If it is false, the vring initialization is left to be done by the + * caller. + */ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) { int i, r; @@ -2047,6 +2138,14 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); goto fail_log; } + + if (hdev->vhost_ops->vhost_set_log_size) { + r = hdev->vhost_ops->vhost_set_log_size(hdev, hdev->log_size, hdev->log); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_log_size failed"); + goto fail_log; + } + } } if (vrings) { r = vhost_dev_set_vring_enable(hdev, true); @@ -2400,3 +2499,145 @@ fail: return ret; } + +bool used_memslots_is_exceeded(void) +{ + return used_memslots_exceeded; +} + +int vhost_dev_resume(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +{ + int i, r; + EventNotifier *e = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; + + /* should only be called after backend is connected */ + if (!hdev->vhost_ops) { + error_report("Missing vhost_ops! Operation not permitted!\n"); + return -EPERM; + } + + vdev->vhost_started = true; + hdev->started = true; + hdev->vdev = vdev; + + if (vhost_dev_has_iommu(hdev)) { + memory_listener_register(&hdev->iommu_listener, vdev->dma_as); + } + + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); + goto fail_mem; + } + for (i = 0; i < hdev->nvqs; ++i) { + r = vhost_virtqueue_start(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + if (r < 0) { + goto fail_vq; + } + } + + r = event_notifier_init(e, 0); + if (r < 0) { + return r; + } + event_notifier_test_and_clear(e); + if (!vdev->use_guest_notifier_mask) { + vhost_config_mask(hdev, vdev, true); + } + if (vrings) { + r = vhost_dev_set_vring_enable(hdev, true); + if (r) { + goto fail_vq; + } + } + if (hdev->vhost_ops->vhost_dev_resume) { + r = hdev->vhost_ops->vhost_dev_resume(hdev); + if (r) { + goto fail_start; + } + } + if (vhost_dev_has_iommu(hdev)) { + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); + + /* + * Update used ring information for IOTLB to work correctly, + * vhost-kernel code requires for this. + */ + for (i = 0; i < hdev->nvqs; ++i) { + struct vhost_virtqueue *vq = hdev->vqs + i; + vhost_device_iotlb_miss(hdev, vq->used_phys, true); + } + } + vhost_start_config_intr(hdev); + return 0; +fail_start: + if (vrings) { + vhost_dev_set_vring_enable(hdev, false); + } +fail_vq: + while (--i >= 0) { + vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + } + +fail_mem: + vdev->vhost_started = false; + hdev->started = false; + return r; +} + +int vhost_dev_suspend(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +{ + int i; + int ret = 0; + EventNotifier *e = &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier; + + /* should only be called after backend is connected */ + if (!hdev->vhost_ops) { + error_report("Missing vhost_ops! 
Operation not permitted!\n"); + return -EPERM; + } + + event_notifier_test_and_clear(e); + event_notifier_test_and_clear(&vdev->config_notifier); + + if (hdev->vhost_ops->vhost_dev_suspend) { + ret = hdev->vhost_ops->vhost_dev_suspend(hdev); + if (ret) { + goto fail_suspend; + } + } + if (vrings) { + ret = vhost_dev_set_vring_enable(hdev, false); + if (ret) { + goto fail_suspend; + } + } + for (i = 0; i < hdev->nvqs; ++i) { + vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + } + + if (vhost_dev_has_iommu(hdev)) { + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); + memory_listener_unregister(&hdev->iommu_listener); + } + vhost_stop_config_intr(hdev); + hdev->started = false; + vdev->vhost_started = false; + hdev->vdev = NULL; + + return ret; + +fail_suspend: + event_notifier_test_and_clear(e); + + return ret; +} diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index 896feb37a1caa805543e971c150d3673675b9a6b..749df6478ea0c112f0123dbe32e9b9a5d685b843 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -25,10 +25,12 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio.h" #include "exec/address-spaces.h" +#include "sysemu/kvm.h" /* #define DEBUG_VIRTIO_BUS */ @@ -70,6 +72,12 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) return; } + if (virtcca_cvm_enabled() && (strcmp(vdev->name, "vhost-user-fs") == 0)) { + /* VIRTIO_F_IOMMU_PLATFORM should be enabled for vhost-user-fs using swiotlb */ + error_setg(errp, "iommu_platform is not supported by this device"); + return; + } + if (klass->device_plugged != NULL) { klass->device_plugged(qbus->parent, &local_err); } @@ -81,6 +89,11 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) vdev->dma_as = &address_space_memory; if (has_iommu) { vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + + if (virtcca_cvm_enabled() && (strcmp(vdev->name, "virtio-user-fs") == 0)) { + vdev_has_iommu = true; + } + /* * Present IOMMU_PLATFORM to the driver iff iommu_plattform=on and * device operational. 
If the driver does not accept IOMMU_PLATFORM
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 0e2cc8d5a890c021c9b900186195f37d093c54f6..4aaced74be120d6e8d35606328f2202c3a2caad6 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -1080,8 +1080,8 @@ static void virtio_crypto_device_realize(DeviceState *dev, Error **errp)
         vcrypto->vqs[i].dataq =
             virtio_add_queue(vdev, 1024, virtio_crypto_handle_dataq_bh);
         vcrypto->vqs[i].dataq_bh =
-            qemu_bh_new_guarded(virtio_crypto_dataq_bh, &vcrypto->vqs[i],
-                                &dev->mem_reentrancy_guard);
+            virtio_bh_new_guarded(dev, virtio_crypto_dataq_bh,
+                                  &vcrypto->vqs[i]);
         vcrypto->vqs[i].vcrypto = vcrypto;
     }
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index e4338795423d56b13bd67fa12fc5c808525aef69..13220c258d7c8fa52da3d1334414f754757fc584 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -49,6 +49,90 @@
  * configuration space */
 #define VIRTIO_PCI_CONFIG_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_enabled(dev))
 
+static KVMRouteChange virtio_pci_route_change;
+
+static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy,
+                                    EventNotifier *n,
+                                    unsigned int vector);
+
+static inline void virtio_pci_begin_route_changes(VirtIODevice *vdev)
+{
+    if (!vdev->defer_kvm_irq_routing) {
+        virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+    }
+}
+
+static inline void virtio_pci_commit_route_changes(VirtIODevice *vdev)
+{
+    if (!vdev->defer_kvm_irq_routing) {
+        kvm_irqchip_commit_route_changes(&virtio_pci_route_change);
+    }
+}
+
+static void virtio_pci_prepare_kvm_msi_virq_batch(VirtIOPCIProxy *proxy)
+{
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+    if (vdev->defer_kvm_irq_routing) {
+        qemu_log("invalid defer kvm irq routing state: %d\n",
+                 vdev->defer_kvm_irq_routing);
+        return;
+    }
+    virtio_pci_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+    vdev->defer_kvm_irq_routing = true;
+}
+
+static void virtio_pci_commit_kvm_msi_virq_batch(VirtIOPCIProxy *proxy)
+{
+    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+    EventNotifier *n;
+    VirtQueue *vq;
+    int vector, index, ret;
+
+    if (!vdev->defer_kvm_irq_routing) {
+        qemu_log("invalid defer kvm irq routing state: %d\n",
+                 vdev->defer_kvm_irq_routing);
+        return;
+    }
+    vdev->defer_kvm_irq_routing = false;
+    kvm_irqchip_commit_route_changes(&virtio_pci_route_change);
+
+    if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
+        return;
+    }
+
+    for (vector = 0; vector < proxy->pci_dev.msix_entries_nr; vector++) {
+        if (msix_is_masked(&proxy->pci_dev, vector)) {
+            continue;
+        }
+
+        if (vector == vdev->config_vector) {
+            n = virtio_config_get_guest_notifier(vdev);
+            ret = kvm_virtio_pci_irqfd_use(proxy, n, vector);
+            if (ret) {
+                qemu_log("config irqfd use failed: %d\n", ret);
+            }
+            continue;
+        }
+
+        vq = virtio_vector_first_queue(vdev, vector);
+
+        while (vq) {
+            index = virtio_get_queue_index(vq);
+            if (!virtio_queue_get_num(vdev, index)) {
+                break;
+            }
+            if (index < proxy->nvqs_with_notifiers) {
+                n = virtio_queue_get_guest_notifier(vq);
+                ret = kvm_virtio_pci_irqfd_use(proxy, n, vector);
+                if (ret < 0) {
+                    qemu_log("Error: irqfd use failed: %d\n", ret);
+                }
+            }
+            vq = virtio_vector_next_queue(vq);
+        }
+    }
+}
+
 static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
                                VirtIOPCIProxy *dev);
 static void virtio_pci_reset(DeviceState *qdev);
@@ -815,12 +899,10 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
     int ret;
     if
(irqfd->users == 0) { - KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); - ret = kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev); + ret = kvm_irqchip_add_msi_route(&virtio_pci_route_change, vector, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_route_changes(&c); irqfd->virq = ret; } irqfd->users++; @@ -860,6 +942,9 @@ static int virtio_pci_get_notifier(VirtIOPCIProxy *proxy, int queue_no, VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); VirtQueue *vq; + if (!proxy->vector_irqfd && vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) + return -1; + if (queue_no == VIRTIO_CONFIG_IRQ_IDX) { *n = virtio_config_get_guest_notifier(vdev); *vector = vdev->config_vector; @@ -922,18 +1007,57 @@ undo: } return ret; } + +#ifdef __aarch64__ +int __attribute__((weak)) kvm_create_shadow_device(PCIDevice *dev) +{ + return 0; +} + +int __attribute__((weak)) kvm_delete_shadow_device(PCIDevice *dev) +{ + return 0; +} +#endif + +#ifdef __aarch64__ +static bool shadow_device_supported(VirtIODevice *vdev) +{ + return !strcmp(vdev->name, "virtio-net") || + !strcmp(vdev->name, "virtio-blk") || + !strcmp(vdev->name, "virtio-scsi"); +} +#endif + static int kvm_virtio_pci_vector_vq_use(VirtIOPCIProxy *proxy, int nvqs) { int queue_no; int ret = 0; VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); +#ifdef __aarch64__ + if (shadow_device_supported(vdev)) { + kvm_create_shadow_device(&proxy->pci_dev); + } +#endif for (queue_no = 0; queue_no < nvqs; queue_no++) { if (!virtio_queue_get_num(vdev, queue_no)) { return -1; } + } + + virtio_pci_begin_route_changes(vdev); + for (queue_no = 0; queue_no < nvqs; queue_no++) { ret = kvm_virtio_pci_vector_use_one(proxy, queue_no); } + virtio_pci_commit_route_changes(vdev); + +#ifdef __aarch64__ + if (shadow_device_supported(vdev) && ret != 0) { + kvm_delete_shadow_device(&proxy->pci_dev); + } +#endif + return ret; } @@ -976,6 +1100,12 @@ static void kvm_virtio_pci_vector_vq_release(VirtIOPCIProxy *proxy, int nvqs) } kvm_virtio_pci_vector_release_one(proxy, queue_no); } + +#ifdef __aarch64__ + if (shadow_device_supported(vdev)) { + kvm_delete_shadow_device(&proxy->pci_dev); + } +#endif } static void kvm_virtio_pci_vector_config_release(VirtIOPCIProxy *proxy) @@ -997,12 +1127,13 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, if (proxy->vector_irqfd) { irqfd = &proxy->vector_irqfd[vector]; if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg, + virtio_pci_begin_route_changes(vdev); + ret = kvm_irqchip_update_msi_route(&virtio_pci_route_change, irqfd->virq, msg, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_routes(kvm_state); + virtio_pci_commit_route_changes(vdev); } } @@ -1017,7 +1148,9 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, event_notifier_set(n); } } else { - ret = kvm_virtio_pci_irqfd_use(proxy, n, vector); + if (!vdev->defer_kvm_irq_routing) { + ret = kvm_virtio_pci_irqfd_use(proxy, n, vector); + } } return ret; } @@ -1274,6 +1407,8 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) if ((with_irqfd || (vdev->use_guest_notifier_mask && k->guest_notifier_mask)) && assign) { + + virtio_pci_prepare_kvm_msi_virq_batch(proxy); if (with_irqfd) { proxy->vector_irqfd = g_malloc0(sizeof(*proxy->vector_irqfd) * @@ -1291,6 +1426,7 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) r = msix_set_vector_notifiers(&proxy->pci_dev, 
virtio_pci_vector_unmask, virtio_pci_vector_mask, virtio_pci_vector_poll); + virtio_pci_commit_kvm_msi_virq_batch(proxy); if (r < 0) { goto notifiers_error; } @@ -1424,6 +1560,38 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, return offset; } +static void virtio_pci_set_vector(VirtIODevice *vdev, + VirtIOPCIProxy *proxy, + int queue_no, uint16_t old_vector, + uint16_t new_vector) +{ + bool kvm_irqfd = (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) && + msix_enabled(&proxy->pci_dev) && kvm_msi_via_irqfd_enabled(); + + if (new_vector == old_vector) { + return; + } + + /* + * If the device uses irqfd and the vector changes after DRIVER_OK is + * set, we need to release the old vector and set up the new one. + * Otherwise just need to set the new vector on the device. + */ + if (kvm_irqfd && old_vector != VIRTIO_NO_VECTOR) { + kvm_virtio_pci_vector_release_one(proxy, queue_no); + } + /* Set the new vector on the device. */ + if (queue_no == VIRTIO_CONFIG_IRQ_IDX) { + vdev->config_vector = new_vector; + } else { + virtio_queue_set_vector(vdev, queue_no, new_vector); + } + /* If the new vector changed need to set it up. */ + if (kvm_irqfd && new_vector != VIRTIO_NO_VECTOR) { + kvm_virtio_pci_vector_use_one(proxy, queue_no); + } +} + int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, uint8_t bar, uint64_t offset, uint64_t length, uint8_t id) @@ -1570,7 +1738,8 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, } else { val = VIRTIO_NO_VECTOR; } - vdev->config_vector = val; + virtio_pci_set_vector(vdev, proxy, VIRTIO_CONFIG_IRQ_IDX, + vdev->config_vector, val); break; case VIRTIO_PCI_COMMON_STATUS: if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) { @@ -1610,7 +1779,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, } else { val = VIRTIO_NO_VECTOR; } - virtio_queue_set_vector(vdev, vdev->queue_sel, val); + virtio_pci_set_vector(vdev, proxy, vdev->queue_sel, vector, val); break; case VIRTIO_PCI_COMMON_Q_ENABLE: if (val == 1) { @@ -2082,7 +2251,9 @@ static void virtio_pci_device_unplugged(DeviceState *d) VirtIOPCIProxy *proxy = VIRTIO_PCI(d); bool modern = virtio_pci_modern(proxy); bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + qemu_log("unplug device name: %s\n", !vdev ? "NULL" : vdev->name); virtio_pci_stop_ioeventfd(proxy); if (modern) { diff --git a/hw/virtio/virtio-scsi-pci.c b/hw/virtio/virtio-scsi-pci.c index e8e3442f38287c6b8ad28ebee16ecac39d933a2c..e542d471622ab50b866122a8388cd3cfc4e8a9aa 100644 --- a/hw/virtio/virtio-scsi-pci.c +++ b/hw/virtio/virtio-scsi-pci.c @@ -20,6 +20,7 @@ #include "qemu/module.h" #include "hw/virtio/virtio-pci.h" #include "qom/object.h" +#include "qemu/log.h" typedef struct VirtIOSCSIPCI VirtIOSCSIPCI; @@ -51,6 +52,8 @@ static void virtio_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) VirtIOSCSIConf *conf = &dev->vdev.parent_obj.conf; char *bus_name; + qemu_log("virtio scsi HBA %s begin to initialize.\n", + !proxy->id ? 
"NULL" : proxy->id); if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) { conf->num_queues = virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED); diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 3a160f86ed02ddaee4a501f4468a20f3ecd696f2..f57b6c955e1c4df0283ffd34589ebaddd80fa03a 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -204,6 +204,15 @@ static const char *virtio_id_to_name(uint16_t device_id) return name; } +static void virtio_check_indirect_feature(VirtIODevice *vdev) +{ + if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Device %s: indirect_desc was not negotiated!\n", + vdev->name); + } +} + /* Called within call_rcu(). */ static void virtio_free_region_cache(VRingMemoryRegionCaches *caches) { @@ -322,7 +331,6 @@ static void vring_packed_event_read(VirtIODevice *vdev, /* Make sure flags is seen before off_wrap */ smp_rmb(); e->off_wrap = virtio_lduw_phys_cached(vdev, cache, off_off); - virtio_tswap16s(vdev, &e->flags); } static void vring_packed_off_wrap_write(VirtIODevice *vdev, @@ -744,6 +752,60 @@ int virtio_queue_empty(VirtQueue *vq) } } +static bool virtio_queue_split_poll(VirtQueue *vq, unsigned shadow_idx) +{ + if (unlikely(!vq->vring.avail)) { + return false; + } + + return (uint16_t)shadow_idx != vring_avail_idx(vq); +} + +static bool virtio_queue_packed_poll(VirtQueue *vq, unsigned shadow_idx) +{ + VRingPackedDesc desc; + VRingMemoryRegionCaches *caches; + + if (unlikely(!vq->vring.desc)) { + return false; + } + + caches = vring_get_region_caches(vq); + if (!caches) { + return false; + } + + vring_packed_desc_read(vq->vdev, &desc, &caches->desc, + shadow_idx, true); + + return is_desc_avail(desc.flags, vq->shadow_avail_wrap_counter); +} + +static bool virtio_queue_poll(VirtQueue *vq, unsigned shadow_idx) +{ + if (virtio_device_disabled(vq->vdev)) { + return false; + } + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + return virtio_queue_packed_poll(vq, shadow_idx); + } else { + return virtio_queue_split_poll(vq, shadow_idx); + } +} + +bool virtio_queue_enable_notification_and_check(VirtQueue *vq, + int opaque) +{ + virtio_queue_set_notification(vq, 1); + + if (opaque >= 0) { + return virtio_queue_poll(vq, (unsigned)opaque); + } else { + return false; + } +} + static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len) { @@ -1323,9 +1385,9 @@ err: goto done; } -void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, - unsigned int *out_bytes, - unsigned max_in_bytes, unsigned max_out_bytes) +int virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes, unsigned max_in_bytes, + unsigned max_out_bytes) { uint16_t desc_size; VRingMemoryRegionCaches *caches; @@ -1358,7 +1420,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, caches); } - return; + return (int)vq->shadow_avail_idx; err: if (in_bytes) { *in_bytes = 0; @@ -1366,6 +1428,8 @@ err: if (out_bytes) { *out_bytes = 0; } + + return -1; } int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, @@ -1559,6 +1623,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) goto done; } + virtio_check_indirect_feature(vdev); + /* loop over the indirect descriptor table */ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, desc.addr, desc.len, false); @@ -1689,6 +1755,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) goto done; } + virtio_check_indirect_feature(vdev); + /* loop over 
the indirect descriptor table */ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, desc.addr, desc.len, false); @@ -2048,7 +2116,14 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) k->set_status(vdev, val); } vdev->status = val; - + if (val) { + qemu_log("%s device status is %d that means %s\n", + vdev->name, val, + (val & VIRTIO_CONFIG_S_DRIVER_OK) ? "DRIVER OK" : + (val & VIRTIO_CONFIG_S_DRIVER) ? "DRIVER" : + (val & VIRTIO_CONFIG_S_ACKNOWLEDGE) ? "ACKNOWLEDGE" : + (val & VIRTIO_CONFIG_S_FAILED) ? "FAILED" : "UNKNOWN"); + } return 0; } @@ -2189,12 +2264,17 @@ void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc, void virtio_queue_set_num(VirtIODevice *vdev, int n, int num) { + int vq_max_size = VIRTQUEUE_MAX_SIZE; + + if (!strcmp(vdev->name, "virtio-net")) { + vq_max_size = VIRTIO_NET_VQ_MAX_SIZE; + } + /* Don't allow guest to flip queue between existent and * nonexistent states, or to set it to an invalid size. */ if (!!num != !!vdev->vq[n].vring.num || - num > VIRTQUEUE_MAX_SIZE || - num < 0) { + num > vq_max_size || num < 0) { return; } vdev->vq[n].vring.num = num; @@ -2326,8 +2406,11 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, break; } - if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) + if (i == VIRTIO_QUEUE_MAX) { + qemu_log("unacceptable queue_size (%d) or num (%d)\n", + queue_size, i); abort(); + } vdev->vq[i].vring.num = queue_size; vdev->vq[i].vring.num_default = queue_size; @@ -2787,6 +2870,35 @@ static const VMStateDescription vmstate_virtio = { } }; +static void check_vring_avail_num(VirtIODevice *vdev, int index) +{ + uint16_t nheads; + VRingMemoryRegionCaches *caches; + + rcu_read_lock(); + caches = qatomic_rcu_read(&vdev->vq[index].vring.caches); + if (caches == NULL) { + /* + * caches may be NULL if virtio_reset is called at the same time, + * such as when the virtual machine starts. + */ + rcu_read_unlock(); + return; + } + + /* Check it isn't doing strange things with descriptor numbers. 
*/ + nheads = vring_avail_idx(&vdev->vq[index]) - vdev->vq[index].last_avail_idx; + if (nheads > vdev->vq[index].vring.num) { + qemu_log("VQ %d size 0x%x Guest index 0x%x " + "inconsistent with Host index 0x%x: " + "delta 0x%x\n", + index, vdev->vq[index].vring.num, + vring_avail_idx(&vdev->vq[index]), + vdev->vq[index].last_avail_idx, nheads); + } + rcu_read_unlock(); +} + int virtio_save(VirtIODevice *vdev, QEMUFile *f) { BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); @@ -2817,6 +2929,8 @@ int virtio_save(VirtIODevice *vdev, QEMUFile *f) if (vdev->vq[i].vring.num == 0) break; + check_vring_avail_num(vdev, i); + qemu_put_be32(f, vdev->vq[i].vring.num); if (k->has_variable_vring_alignment) { qemu_put_be32(f, vdev->vq[i].vring.align); @@ -2875,6 +2989,13 @@ static int virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val) { VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); bool bad = (val & ~(vdev->host_features)) != 0; + uint64_t feat = val & ~(vdev->host_features); + + if (bad && k->print_features) { + qemu_log("error: Please check host config, "\ + "because host does not support required feature bits 0x%" PRIx64 "\n", feat); + k->print_features(feat); + } val &= vdev->host_features; if (k->set_features) { @@ -4095,3 +4216,13 @@ static void virtio_register_types(void) } type_init(virtio_register_types) + +QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev, + QEMUBHFunc *cb, void *opaque, + const char *name) +{ + DeviceState *transport = qdev_get_parent_bus(dev)->parent; + + return qemu_bh_new_full(cb, opaque, name, + &transport->mem_reentrancy_guard); +} diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c index 4973e7d9c95a6909c7fc69b867e933362f6e1e8b..c10b0899143a4377912840a58fa10d0092514e26 100644 --- a/hw/xen/xen-bus.c +++ b/hw/xen/xen-bus.c @@ -352,6 +352,7 @@ static void xen_bus_realize(BusState *bus, Error **errp) error_reportf_err(local_err, "failed to set up '%s' enumeration watch: ", type[i]); + local_err = NULL; } g_free(node); diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 4e31d161c57aeb5f51117973be8ff763d0d6548e..a6e2436524f02325670cf136e76eeee11e7431a3 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -57,6 +57,8 @@ #define BLOCK_OPT_DATA_FILE_RAW "data_file_raw" #define BLOCK_OPT_COMPRESSION_TYPE "compression_type" #define BLOCK_OPT_EXTL2 "extended_l2" +#define BLOCK_OPT_CACHE "cache" +#define BLOCK_OPT_BUFFER_SIZE "buffer_size" #define BLOCK_PROBE_BUF_SIZE 512 diff --git a/include/block/nbd.h b/include/block/nbd.h index 4e7bd6342f9cc39378e913cb47969331ca462ee0..d4f8b21aecc379cb0049a28cc2d39039791373e7 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -33,6 +33,19 @@ typedef struct NBDMetaContexts NBDMetaContexts; extern const BlockExportDriver blk_exp_nbd; +/* + * NBD_DEFAULT_HANDSHAKE_MAX_SECS: Number of seconds in which client must + * succeed at NBD_OPT_GO before being forcefully dropped as too slow. + */ +#define NBD_DEFAULT_HANDSHAKE_MAX_SECS 10 + +/* + * NBD_DEFAULT_MAX_CONNECTIONS: Number of client sockets to allow at + * once; must be large enough to allow a MULTI_CONN-aware client like + * nbdcopy to create its typical number of 8-16 sockets. 
+ */ +#define NBD_DEFAULT_MAX_CONNECTIONS 100 + /* Handshake phase structs - this struct is passed on the wire */ typedef struct NBDOption { @@ -403,9 +416,12 @@ AioContext *nbd_export_aio_context(NBDExport *exp); NBDExport *nbd_export_find(const char *name); void nbd_client_new(QIOChannelSocket *sioc, + uint32_t handshake_max_secs, QCryptoTLSCreds *tlscreds, const char *tlsauthz, - void (*close_fn)(NBDClient *, bool)); + void (*close_fn)(NBDClient *, bool), + void *owner); +void *nbd_client_owner(NBDClient *client); void nbd_client_get(NBDClient *client); void nbd_client_put(NBDClient *client); diff --git a/include/chardev/char.h b/include/chardev/char.h index 01df55f9e8c8da5dfd40cafa195cfa606e172fb7..f8bd4694664d20155236f65448840117d9a84f8a 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -14,6 +14,8 @@ #define IAC_SB 250 #define IAC 255 +#define CHR_FOR_VHOST_USER 0x32a1 + /* character device */ typedef struct CharBackend CharBackend; @@ -70,6 +72,7 @@ struct Chardev { GSource *gsource; GMainContext *gcontext; DECLARE_BITMAP(features, QEMU_CHAR_FEATURE_LAST); + int chr_for_flag; }; /** @@ -227,6 +230,16 @@ int qemu_chr_write(Chardev *s, const uint8_t *buf, int len, bool write_all); #define qemu_chr_write_all(s, buf, len) qemu_chr_write(s, buf, len, true) int qemu_chr_wait_connected(Chardev *chr, Error **errp); +/** + * @qemu_chr_set_reconnect_time: + * + * Set reconnect time for char disconnect. + * Currently, only vhost user will call it. + * + * @reconnect_time the reconnect_time to be set + */ +void qemu_chr_set_reconnect_time(Chardev *chr, int64_t reconnect_time); + #define TYPE_CHARDEV "chardev" OBJECT_DECLARE_TYPE(Chardev, ChardevClass, CHARDEV) @@ -306,6 +319,9 @@ struct ChardevClass { /* handle various events */ void (*chr_be_event)(Chardev *s, QEMUChrEvent event); + + /* set reconnect time */ + void (*chr_set_reconnect_time)(Chardev *chr, int64_t reconnect_time); }; Chardev *qemu_chardev_new(const char *id, const char *typename, diff --git a/include/exec/address-spaces.h b/include/exec/address-spaces.h index 0d0aa61d68966a56e65e35cc6f38687b7e16816f..4518b5da868283e2ed1fc45f68988e5fec9c6e17 100644 --- a/include/exec/address-spaces.h +++ b/include/exec/address-spaces.h @@ -33,6 +33,9 @@ MemoryRegion *get_system_io(void); extern AddressSpace address_space_memory; extern AddressSpace address_space_io; +extern AddressSpace address_space_virtcca_shared_memory; + +extern MemoryRegion *virtcca_shared_hugepage; #endif diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index ba2dd4b5dfc4a0c2d039b3e67718afc5844a6317..2cba27642f765a7ccd6ee33c1d366e2b12e0de81 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -53,8 +53,54 @@ struct ConfidentialGuestSupport { bool ready; }; +/** + * The functions registers with ConfidentialGuestMemoryEncryptionOps will be + * used during the encrypted guest migration. 
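+ * The hooks below cover per-page transfer, tracking of the shared
+ * (unencrypted) region list, batched transfer of queued pages, and the
+ * encrypted CPU state.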
+ */ +struct ConfidentialGuestMemoryEncryptionOps { + /* Initialize the platform specific state before starting the migration */ + int (*save_setup)(const char *pdh, const char *plat_cert, + const char *amd_cert); + + /* Write the encrypted page and metadata associated with it */ + int (*save_outgoing_page)(QEMUFile *f, uint8_t *ptr, uint32_t size, + uint64_t *bytes_sent); + + /* Load the incoming encrypted page into guest memory */ + int (*load_incoming_page)(QEMUFile *f, uint8_t *ptr); + + /* Check if gfn is in shared/unencrypted region */ + bool (*is_gfn_in_unshared_region)(unsigned long gfn); + + /* Write the shared regions list */ + int (*save_outgoing_shared_regions_list)(QEMUFile *f, uint64_t *bytes_sent); + + /* Load the shared regions list */ + int (*load_incoming_shared_regions_list)(QEMUFile *f); + + /* Queue the encrypted page and metadata associated with it into a list */ + int (*queue_outgoing_page)(uint8_t *ptr, uint32_t size, uint64_t addr); + + /* Write the list queued with encrypted pages and metadata associated + * with them */ + int (*save_queued_outgoing_pages)(QEMUFile *f, uint64_t *bytes_sent); + + /* Queue the incoming encrypted page into a list */ + int (*queue_incoming_page)(QEMUFile *f, uint8_t *ptr); + + /* Load the incoming encrypted pages queued in list into guest memory */ + int (*load_queued_incoming_pages)(QEMUFile *f); + + /* Write the encrypted cpu state */ + int (*save_outgoing_cpu_state)(QEMUFile *f, uint64_t *bytes_sent); + + /* Load the encrypted cpu state */ + int (*load_incoming_cpu_state)(QEMUFile *f); +}; + typedef struct ConfidentialGuestSupportClass { ObjectClass parent; + struct ConfidentialGuestMemoryEncryptionOps *memory_encryption_ops; } ConfidentialGuestSupportClass; #endif /* !CONFIG_USER_ONLY */ diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 41115d8919407109d83b9e633d0aacfa18dd03ac..d21d9990ad0e548b59bcf846c0dc919d45050f25 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -28,6 +28,7 @@ typedef uint64_t vaddr; void cpu_exec_init_all(void); void cpu_exec_step_atomic(CPUState *cpu); +void virtcca_shared_memory_address_space_init(void); /* Using intptr_t ensures that qemu_*_page_mask is sign-extended even * when intptr_t is 32-bit and we are aligning a long long. @@ -139,6 +140,14 @@ size_t qemu_ram_pagesize_largest(void); */ void cpu_address_space_init(CPUState *cpu, int asidx, const char *prefix, MemoryRegion *mr); +/** + * cpu_address_space_destroy: + * @cpu: CPU for which address space needs to be destroyed + * @asidx: integer index of this address space + * + * Note that with KVM only one address space is supported. 
+ */ +void cpu_address_space_destroy(CPUState *cpu, int asidx); void cpu_physical_memory_rw(hwaddr addr, void *buf, hwaddr len, bool is_write); @@ -157,8 +166,6 @@ void *cpu_physical_memory_map(hwaddr addr, bool is_write); void cpu_physical_memory_unmap(void *buffer, hwaddr len, bool is_write, hwaddr access_len); -void cpu_register_map_client(QEMUBH *bh); -void cpu_unregister_map_client(QEMUBH *bh); bool cpu_physical_memory_is_io(hwaddr phys_addr); diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h index d8a3c56fa2b8f10c261d2d46f146a72202e557c3..e2e8dff051dcbbbb269cc3a7631153259453ab9c 100644 --- a/include/exec/gdbstub.h +++ b/include/exec/gdbstub.h @@ -40,6 +40,12 @@ void gdb_register_coprocessor(CPUState *cpu, gdb_get_reg_cb get_reg, gdb_set_reg_cb set_reg, int num_regs, const char *xml, int g_pos); +/** + * gdb_unregister_coprocessor_all() - unregisters supplemental set of registers + * @cpu - the CPU associated with registers + */ +void gdb_unregister_coprocessor_all(CPUState *cpu); + /** * gdbserver_start: start the gdb server * @port_or_device: connection spec for gdb diff --git a/include/exec/memop.h b/include/exec/memop.h index a86dc6743a620f021c0a34370a2affbdad25ae99..5b9064819ca91b4e643a5533833002291bf9498a 100644 --- a/include/exec/memop.h +++ b/include/exec/memop.h @@ -164,10 +164,4 @@ static inline MemOp size_memop(unsigned size) return ctz32(size); } -/* Big endianness from MemOp. */ -static inline bool memop_big_endian(MemOp op) -{ - return (op & MO_BSWAP) == MO_BE; -} - #endif diff --git a/include/exec/memory.h b/include/exec/memory.h index 831f7c996d9da49cdf9884fdeffa32959865cb07..0361ec20547bfa5fa9d856a7f2d4bbc23e06443f 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -243,6 +243,20 @@ typedef struct IOMMUTLBEvent { /* RAM FD is opened read-only */ #define RAM_READONLY_FD (1 << 11) +/* The address limit of the VirtCCA bounce buffer is 4GB. */ +#define VIRTCCA_SHARED_HUGEPAGE_ADDR_LIMIT 0x100000000ULL + +/* The VirtCCA shared hugepage memory granularity is 1GB */ +#define VIRTCCA_SHARED_HUGEPAGE_ALIGN 0x40000000ULL + +/* The default GPA starting address of VM is 1GB */ +#define DEFAULT_VM_GPA_START 0x40000000ULL + +/* The GPA starting address of the VirtCCA CVM is 1GB or 3GB */ +extern uint64_t virtcca_cvm_gpa_start; + +extern uint64_t virtcca_cvm_ram_size; + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, @@ -775,6 +789,17 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, ram_addr_t *ram_addr, bool *read_only, bool *mr_has_discard_manager); +typedef struct SharedRegionListener SharedRegionListener; +struct SharedRegionListener { + MemoryListener *listener; + AddressSpace *as; + QTAILQ_ENTRY(SharedRegionListener) next; +}; + +void shared_region_register_listener(SharedRegionListener *shl); +void shared_region_unregister_listener(SharedRegionListener *shl); +void *shared_region_listeners_get(void); + typedef struct CoalescedMemoryRange CoalescedMemoryRange; typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd; @@ -1054,6 +1079,27 @@ struct MemoryListener { void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection *section, bool match_data, uint64_t data, EventNotifier *e); + /** + * @eventfd_begin: + * + * Called during an address space begin to update ioeventfd, + * notify kvm that ioeventfd will be update in batches. + * + * @listener: The #MemoryListener. 
+ */ + void (*eventfd_begin)(MemoryListener *listener); + + /** + * @eventfd_end: + * + * Called at the end of an address space ioeventfd update, to + * notify KVM that all ioeventfd modifications have been submitted + * and batch processing can begin. + * + * @listener: The #MemoryListener. + */ + void (*eventfd_end)(MemoryListener *listener); + /** * @coalesced_io_add: @@ -1106,6 +1152,13 @@ struct MemoryListener { QTAILQ_ENTRY(MemoryListener) link_as; }; +typedef struct AddressSpaceMapClient { + QEMUBH *bh; + QLIST_ENTRY(AddressSpaceMapClient) link; +} AddressSpaceMapClient; + +#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096) + /** * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects */ @@ -1114,6 +1167,7 @@ struct AddressSpace { struct rcu_head rcu; char *name; MemoryRegion *root; + bool free_in_rcu; /* Accessed via RCU. */ struct FlatView *current_map; @@ -1123,6 +1177,14 @@ struct AddressSpace { struct MemoryRegionIoeventfd *ioeventfds; QTAILQ_HEAD(, MemoryListener) listeners; QTAILQ_ENTRY(AddressSpace) address_spaces_link; + + /* Maximum DMA bounce buffer size used for indirect memory map requests */ + size_t max_bounce_buffer_size; + /* Total size of bounce buffers currently allocated, atomically accessed */ + size_t bounce_buffer_size; + /* List of callbacks to invoke when buffers free up */ + QemuMutex map_client_list_lock; + QLIST_HEAD(, AddressSpaceMapClient) map_client_list; }; typedef struct AddressSpaceDispatch AddressSpaceDispatch; @@ -1139,6 +1201,8 @@ struct FlatView { unsigned nr_allocated; struct AddressSpaceDispatch *dispatch; MemoryRegion *root; + #define FLATVIEW_FLAG_LAST_PROCESSED (1 << 0) + unsigned flags; }; static inline FlatView *address_space_to_flatview(AddressSpace *as) @@ -2044,6 +2108,16 @@ void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client); void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, hwaddr size); +/** + * is_first_section: Determine whether a MemoryRegionSection is the first + * section of its corresponding parent MemoryRegion. + * + * @section: MemoryRegionSection + */ +bool is_first_section(MemoryRegionSection *section); + /** * memory_region_clear_dirty_bitmap - clear dirty bitmap for memory range * @@ -2526,6 +2600,11 @@ void memory_region_transaction_begin(void); */ void memory_region_transaction_commit(void); +/** + * memory_region_commit: Force an immediate commit of the memory region + * transaction. + */ +void memory_region_commit(void); + /** * memory_listener_register: register callbacks to be called when memory * sections are mapped or unmapped into an address @@ -2594,6 +2673,15 @@ MemTxResult memory_region_dispatch_write(MemoryRegion *mr, MemOp op, MemTxAttrs attrs); +/** + * memory_section_set_dirty_bytemap: Mark a range of pages as dirty for a + * memory section, using a bytemap + * + * @section: the memory section being dirtied. + * @bytemap: bytemap that stores dirty page range information. + */ +int64_t memory_section_set_dirty_bytemap(MemoryRegionSection *section, unsigned long *bytemap); + /** * address_space_init: initializes an address space * @@ -2906,8 +2994,8 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, hwaddr len, * May return %NULL and set *@plen to zero(0), if resources needed to perform * the mapping are exhausted. * Use only for reads OR writes - not for read-modify-write operations. - * Use cpu_register_map_client() to know when retrying the map operation is - * likely to succeed.
+ * Use address_space_register_map_client() to know when retrying the map + * operation is likely to succeed. * * @as: #AddressSpace to be accessed * @addr: address within that address space @@ -2932,6 +3020,28 @@ void *address_space_map(AddressSpace *as, hwaddr addr, void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, bool is_write, hwaddr access_len); +/* + * address_space_register_map_client: Register a callback to invoke when + * resources for address_space_map() are available again. + * + * address_space_map may fail when there are not enough resources available, + * such as when bounce buffer memory would exceed the limit. The callback can + * be used to retry the address_space_map operation. Note that the callback + * gets automatically removed after firing. + * + * @as: #AddressSpace to be accessed + * @bh: callback to invoke when address_space_map() retry is appropriate + */ +void address_space_register_map_client(AddressSpace *as, QEMUBH *bh); + +/* + * address_space_unregister_map_client: Unregister a callback that has + * previously been registered and not fired yet. + * + * @as: #AddressSpace to be accessed + * @bh: callback to unregister + */ +void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh); /* Internal functions, part of the implementation of address_space_read. */ MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 90676093f5d50c3e4fe6afc55e3ed26587eaf47e..ef6988b445e1734767551f59ff8a36d8cca8b1b3 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -535,5 +535,49 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, return num_dirty; } + +#define BYTES_PER_LONG (sizeof(unsigned long)) +#define BYTE_WORD(nr) ((nr) / BYTES_PER_LONG) +#define BYTES_TO_LONGS(nr) DIV_ROUND_UP(nr, BYTES_PER_LONG) + +static inline int64_t _set_dirty_bytemap_atomic(unsigned long *bytemap, unsigned long cur_pfn) +{ + char *byte_of_long = (char *)bytemap; + int i; + int64_t dirty_num = 0; + + for (i = 0; i < BYTES_PER_LONG; i++) { + if (byte_of_long[i]) { + cpu_physical_memory_set_dirty_range((cur_pfn + i) << TARGET_PAGE_BITS, + TARGET_PAGE_SIZE, + 1 << DIRTY_MEMORY_MIGRATION); + /* Per byte ops, no need to atomic_xchg */ + byte_of_long[i] = 0; + dirty_num++; + } + } + + return dirty_num; +} + +static inline int64_t cpu_physical_memory_set_dirty_bytemap(unsigned long *bytemap, + ram_addr_t start, + ram_addr_t pages) +{ + unsigned long i; + unsigned long len = BYTES_TO_LONGS(pages); + unsigned long pfn = (start >> TARGET_PAGE_BITS) / + BYTES_PER_LONG * BYTES_PER_LONG; + int64_t dirty_mig_bits = 0; + + for (i = 0; i < len; i++) { + if (bytemap[i]) { + dirty_mig_bits += _set_dirty_bytemap_atomic(&bytemap[i], + pfn + BYTES_PER_LONG * i); + } + } + + return dirty_mig_bits; +} #endif #endif diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 69c6a53902939a2de7b38284baff5ed4a86daa63..8f9579ed705b5f6327e0dca9ffd3fda9bdefc725 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -44,7 +44,7 @@ struct RAMBlock { size_t page_size; /* dirty bitmap used during migration */ unsigned long *bmap; - /* bitmap of already received pages in postcopy */ + /* Bitmap of already received pages. Only used on destination side. 
*/ unsigned long *receivedmap; /* diff --git a/include/exec/ramlist.h b/include/exec/ramlist.h index 2ad2a81accfb6fc4ae4534e009d72be9cc558909..d9cfe530beaf386087d65730cdacf0bd43986edb 100644 --- a/include/exec/ramlist.h +++ b/include/exec/ramlist.h @@ -50,6 +50,7 @@ typedef struct RAMList { /* RCU-enabled, writes protected by the ramlist lock. */ QLIST_HEAD(, RAMBlock) blocks; DirtyMemoryBlocks *dirty_memory[DIRTY_MEMORY_NUM]; + unsigned int num_dirty_blocks; uint32_t version; QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers; } RAMList; diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index 2b42e4192bf62e01d34d8c148b86eb8b2322187f..7a8b708cdaadb44b850f39a147aa2ba6c70e2fc4 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -90,7 +90,80 @@ typedef struct AcpiFadtData { unsigned *xdsdt_tbl_offset; } AcpiFadtData; +typedef struct AcpiGas { + uint8_t id; /* Address space ID */ + uint8_t width; /* Register bit width */ + uint8_t offset; /* Register bit offset */ + uint8_t size; /* Access size */ + uint64_t addr; /* Address */ +} AcpiGas; + +/* SPCR (Serial Port Console Redirection table) */ +typedef struct AcpiSpcrData { + uint8_t interface_type; + uint8_t reserved[3]; + struct AcpiGas base_addr; + uint8_t interrupt_type; + uint8_t pc_interrupt; + uint32_t interrupt; /* Global system interrupt */ + uint8_t baud_rate; + uint8_t parity; + uint8_t stop_bits; + uint8_t flow_control; + uint8_t terminal_type; + uint8_t language; + uint8_t reserved1; + uint16_t pci_device_id; /* Must be 0xffff if not PCI device */ + uint16_t pci_vendor_id; /* Must be 0xffff if not PCI device */ + uint8_t pci_bus; + uint8_t pci_device; + uint8_t pci_function; + uint32_t pci_flags; + uint8_t pci_segment; + uint32_t reserved2; +} AcpiSpcrData; + #define ACPI_FADT_ARM_PSCI_COMPLIANT (1 << 0) #define ACPI_FADT_ARM_PSCI_USE_HVC (1 << 1) +/* + * CPPC register definition from kernel header + * include/acpi/cppc_acpi.h + * The last element is newly added for convenience + */ +enum cppc_regs { + HIGHEST_PERF, + NOMINAL_PERF, + LOW_NON_LINEAR_PERF, + LOWEST_PERF, + GUARANTEED_PERF, + DESIRED_PERF, + MIN_PERF, + MAX_PERF, + PERF_REDUC_TOLERANCE, + TIME_WINDOW, + CTR_WRAP_TIME, + REFERENCE_CTR, + DELIVERED_CTR, + PERF_LIMITED, + ENABLE, + AUTO_SEL_ENABLE, + AUTO_ACT_WINDOW, + ENERGY_PERF, + REFERENCE_PERF, + LOWEST_FREQ, + NOMINAL_FREQ, + CPPC_REG_COUNT, +}; + +#define CPPC_REG_PER_CPU_STRIDE 0x40 + +/* + * Offset for each CPPC register; -1 for unavailable. + * The whole register space is unavailable if the desired perf offset is -1. + */ +extern int cppc_regs_offset[CPPC_REG_COUNT]; + #endif diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index ff2a310270a44f25821f324479a1390379b9d894..381ad4a8a1782b52b62a0e2c5289bc0e5d780f27 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -382,6 +382,9 @@ Aml *aml_dma(AmlDmaType typ, AmlDmaBusMaster bm, AmlTransferSize sz, uint8_t channel); Aml *aml_sleep(uint64_t msec); Aml *aml_i2c_serial_bus_device(uint16_t address, const char *resource_source); +Aml *aml_generic_register(AmlRegionSpace rs, uint8_t reg_width, + uint8_t reg_offset, AmlAccessType type, + uint64_t addr); /* Block AML object primitives */ Aml *aml_scope(const char *name_format, ...)
G_GNUC_PRINTF(1, 2); @@ -412,6 +415,7 @@ Aml *aml_sizeof(Aml *arg); Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target); Aml *aml_object_type(Aml *object); +void build_append_byte(GArray *array, uint8_t val); void build_append_int_noprefix(GArray *table, uint64_t value, int size); typedef struct AcpiTable { @@ -489,6 +493,11 @@ void build_srat_memory(GArray *table_data, uint64_t base, void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms, const char *oem_id, const char *oem_table_id); +void build_processor_hierarchy_node(GArray *tbl, uint32_t flags, + uint32_t parent, uint32_t id, + uint32_t *priv_rsrc, + uint32_t priv_num); + void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms, const char *oem_id, const char *oem_table_id); @@ -497,4 +506,8 @@ void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, void build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog, const char *oem_id, const char *oem_table_id); + +void build_spcr(GArray *table_data, BIOSLinker *linker, + const AcpiSpcrData *f, const uint8_t rev, + const char *oem_id, const char *oem_table_id); #endif diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index bc901660fb6fd6f585c553636c1fcb183d2c1276..fa5b5e5f0136e617461593dda2a1c7539b7e16d5 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -18,11 +18,15 @@ #include "hw/boards.h" #include "hw/hotplug.h" +#define ACPI_CPU_HOTPLUG_REG_LEN 12 + typedef struct AcpiCpuStatus { - struct CPUState *cpu; + CPUState *cpu; uint64_t arch_id; bool is_inserting; bool is_removing; + bool is_present; + bool is_enabled; bool fw_remove; uint32_t ost_event; uint32_t ost_status; @@ -59,10 +63,15 @@ typedef struct CPUHotplugFeatures { typedef void (*build_madt_cpu_fn)(int uid, const CPUArchIdList *apic_ids, GArray *entry, bool force_enabled); +typedef void (*build_cpu_cppc_fn)(int uid, int num_cpu, Aml *dev); + void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, + build_cpu_cppc_fn build_cpu_cppc, + hwaddr base_addr, const char *res_root, - const char *event_handler_method); + const char *event_handler_method, + AmlRegionSpace rs); void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list); diff --git a/include/hw/acpi/cpu_hotplug.h b/include/hw/acpi/cpu_hotplug.h index 3b932abbbbeeba3e5c2dc00b0ae1d99312682925..ef631750b4441e92cbf218f2d40373339202a31c 100644 --- a/include/hw/acpi/cpu_hotplug.h +++ b/include/hw/acpi/cpu_hotplug.h @@ -19,6 +19,10 @@ #include "hw/hotplug.h" #include "hw/acpi/cpu.h" +#define ACPI_CPU_HOTPLUG_REG_LEN 12 +#define ACPI_CPU_SCAN_METHOD "CSCN" +#define ACPI_CPU_CONTAINER "\\_SB.CPUS" + typedef struct AcpiCpuHotplug { Object *device; MemoryRegion io; diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index ba84ce02147726309c489256f69d7b9be9ef5a7b..d1df3c12e535a4ca0d1e1f63f69aa107d5c7adfa 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -60,8 +60,10 @@ #define HW_ACPI_GENERIC_EVENT_DEVICE_H #include "hw/sysbus.h" +#include "hw/acpi/cpu_hotplug.h" #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/ghes.h" +#include "hw/acpi/cpu.h" #include "qom/object.h" #define ACPI_POWER_BUTTON_DEVICE "PWRB" @@ -80,8 +82,11 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) /* ACPI_GED_REG_RESET value for reset*/ #define ACPI_GED_RESET_VALUE 0x42 -/* 
ACPI_GED_REG_SLEEP_CTL.SLP_TYP value for S5 (aka poweroff) */ -#define ACPI_GED_SLP_TYP_S5 0x05 +/* [ACPI 5.0 Chapter 4.8.3.7] Sleep Control and Status Register */ +#define ACPI_GED_SLP_TYP_POS 0x2 /* SLP_TYPx Bit Offset */ +#define ACPI_GED_SLP_TYP_MASK 0x07 /* SLP_TYPx 3-bit mask */ +#define ACPI_GED_SLP_TYP_S5 0x05 /* System _S5 State (Soft Off) */ +#define ACPI_GED_SLP_EN 0x20 /* SLP_EN write-only bit */ #define GED_DEVICE "GED" #define AML_GED_EVT_REG "EREG" @@ -95,6 +100,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) #define ACPI_GED_MEM_HOTPLUG_EVT 0x1 #define ACPI_GED_PWR_DOWN_EVT 0x2 #define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4 +#define ACPI_GED_CPU_HOTPLUG_EVT 0x8 typedef struct GEDState { MemoryRegion evt; @@ -106,6 +112,8 @@ struct AcpiGedState { SysBusDevice parent_obj; MemHotplugState memhp_state; MemoryRegion container_memhp; + CPUHotplugState cpuhp_state; + MemoryRegion container_cpuhp; GEDState ged_state; uint32_t ged_event_bitmap; qemu_irq irq; diff --git a/include/hw/arm/boot.h b/include/hw/arm/boot.h index 80c492d7421e840174acadd71d9e525fa84fcc86..06ca1d90b222483ffb769ab3c83f6056f0ebbb11 100644 --- a/include/hw/arm/boot.h +++ b/include/hw/arm/boot.h @@ -39,6 +39,7 @@ void armv7m_load_kernel(ARMCPU *cpu, const char *kernel_filename, /* arm_boot.c */ struct arm_boot_info { uint64_t ram_size; + void *numa_info; const char *kernel_filename; const char *kernel_cmdline; const char *initrd_filename; @@ -132,6 +133,9 @@ struct arm_boot_info { bool secure_board_setup; arm_endianness endianness; + hwaddr firmware_base; + hwaddr firmware_max_size; + bool confidential; }; /** @@ -178,6 +182,8 @@ AddressSpace *arm_boot_address_space(ARMCPU *cpu, int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, hwaddr addr_limit, AddressSpace *as, MachineState *ms); +void do_cpu_reset(void *opaque); + /* Write a secure board setup routine with a dummy handler for SMCs */ void arm_write_secure_board_setup_dummy_smc(ARMCPU *cpu, const struct arm_boot_info *info, diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h index fd8d772da117060d9bccc9b1ea05a394e40e8421..8ae33c375349f3236a2777ff5159112aeec81b78 100644 --- a/include/hw/arm/smmu-common.h +++ b/include/hw/arm/smmu-common.h @@ -22,6 +22,8 @@ #include "hw/sysbus.h" #include "hw/pci/pci.h" #include "qom/object.h" +#include "sysemu/iommufd.h" +#include #define SMMU_PCI_BUS_MAX 256 #define SMMU_PCI_DEVFN_MAX 256 @@ -49,6 +51,13 @@ typedef enum { SMMU_PTW_ERR_PERMISSION, /* Permission fault */ } SMMUPTWEventType; +/* SMMU Stage */ +typedef enum { + SMMU_STAGE_1 = 1, + SMMU_STAGE_2, + SMMU_NESTED, +} SMMUStage; + typedef struct SMMUPTWEventInfo { int stage; SMMUPTWEventType type; @@ -106,14 +115,73 @@ typedef struct SMMUTransCfg { struct SMMUS2Cfg s2cfg; } SMMUTransCfg; +typedef struct SMMUS2Hwpt { + IOMMUFDBackend *iommufd; + uint32_t hwpt_id; + uint32_t ioas_id; +} SMMUS2Hwpt; + +typedef struct SMMUViommu { + void *smmu; + IOMMUFDBackend *iommufd; + IOMMUFDViommu *core; + SMMUS2Hwpt *s2_hwpt; + uint32_t bypass_hwpt_id; + uint32_t abort_hwpt_id; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; +} SMMUViommu; + +typedef struct SMMUVdev { + SMMUViommu *vsmmu; + IOMMUFDVdev *core; + uint32_t sid; +}SMMUVdev; + +typedef struct PendFaultEntry { + struct iommu_hwpt_pgfault fault; + QTAILQ_ENTRY(PendFaultEntry) entry; +} PendFaultEntry; + +typedef struct PageRespEntry { + struct iommu_hwpt_page_response resp; + QTAILQ_ENTRY(PageRespEntry) entry; +} PageRespEntry; + +typedef struct SMMUS1Hwpt { + 
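/* Editor's note: opaque back-pointers, presumably to the owning
+ * SMMUDevice and SMMUState (an assumption from the field names). */ +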
void *sdev; + void *smmu; + IOMMUFDBackend *iommufd; + SMMUViommu *viommu; + uint32_t hwpt_id; + uint32_t out_fault_fd; + QLIST_HEAD(, SMMUDevice) device_list; + QLIST_ENTRY(SMMUViommu) next; + /* fault handling */ + struct io_uring fault_ring; + QemuThread read_fault_thread; + QemuThread write_fault_thread; + QemuMutex fault_mutex; + QemuCond fault_cond; + QTAILQ_HEAD(, PageRespEntry) pageresp; + QTAILQ_HEAD(, PendFaultEntry) pendfault; + bool exiting; +} SMMUS1Hwpt; + typedef struct SMMUDevice { void *smmu; PCIBus *bus; int devfn; IOMMUMemoryRegion iommu; + HostIOMMUDeviceIOMMUFD *idev; + SMMUViommu *viommu; + SMMUVdev *vdev; + SMMUS1Hwpt *s1_hwpt; AddressSpace as; + AddressSpace as_sysmem; uint32_t cfg_cache_hits; uint32_t cfg_cache_misses; + struct iommu_hw_info_arm_smmuv3 info; QLIST_ENTRY(SMMUDevice) next; } SMMUDevice; @@ -134,7 +202,13 @@ struct SMMUState { /* */ SysBusDevice dev; const char *mrtypename; + MemoryRegion root; MemoryRegion iomem; + MemoryRegion sysmem; + + /* Nested SMMU */ + bool nested; + SMMUViommu *viommu; GHashTable *smmu_pcibus_by_busptr; GHashTable *configs; /* cache for configuration data */ @@ -181,8 +255,8 @@ int smmu_ptw(SMMUTransCfg *cfg, dma_addr_t iova, IOMMUAccessFlags perm, */ SMMUTransTableInfo *select_tt(SMMUTransCfg *cfg, dma_addr_t iova); -/* Return the iommu mr associated to @sid, or NULL if none */ -IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid); +/* Return the SMMUDevice associated to @sid, or NULL if none */ +SMMUDevice *smmu_find_sdev(SMMUState *s, uint32_t sid); #define SMMU_IOTLB_MAX_SIZE 256 @@ -200,4 +274,15 @@ void smmu_iotlb_inv_iova(SMMUState *s, int asid, int vmid, dma_addr_t iova, /* Unmap the range of all the notifiers registered to any IOMMU mr */ void smmu_inv_notifiers_all(SMMUState *s); +/* IOMMUFD helpers */ +int smmu_dev_get_info(SMMUDevice *sdev, uint32_t *data_type, + uint32_t data_len, uint8_t *pasid, void *data); +void smmu_dev_uninstall_nested_ste(SMMUDevice *sdev, bool abort); +int smmu_dev_install_nested_ste(SMMUDevice *sdev, uint32_t data_type, + uint32_t data_len, void *data, + bool req_fault_fd); +int smmu_hwpt_invalidate_cache(SMMUS1Hwpt *s1_hwpt, uint32_t type, uint32_t len, + uint32_t *num, void *reqs); +int smmu_viommu_invalidate_cache(IOMMUFDViommu *viommu, uint32_t type, + uint32_t len, uint32_t *num, void *reqs); #endif /* HW_ARM_SMMU_COMMON_H */ diff --git a/include/hw/arm/smmuv3.h b/include/hw/arm/smmuv3.h index d183a627669b5164dc5fa9446f4da203498fcb06..79b6fcd8e7b79f60d83ba8dd0d2f7e7e58e11a14 100644 --- a/include/hw/arm/smmuv3.h +++ b/include/hw/arm/smmuv3.h @@ -84,6 +84,23 @@ struct SMMUv3Class { #define TYPE_ARM_SMMUV3 "arm-smmuv3" OBJECT_DECLARE_TYPE(SMMUv3State, SMMUv3Class, ARM_SMMUV3) +#define TYPE_ARM_SMMUV3_ACCEL "arm-smmuv3-accel" +OBJECT_DECLARE_TYPE(SMMUv3AccelState, SMMUv3AccelClass, ARM_SMMUV3_ACCEL) + +struct SMMUv3AccelState { + SMMUv3State smmuv3_state; + + char *pci_bus; +}; + +struct SMMUv3AccelClass { + /*< private >*/ + SMMUv3Class smmuv3_class; + /*< public >*/ + + DeviceRealize parent_realize; +}; + #define STAGE1_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S1P) #define STAGE2_SUPPORTED(s) FIELD_EX32(s->idr[0], IDR0, S2P) diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index f69239850e617332ec3ed223edcf4a69019ec973..3e2759d225066c96e9a408dda79e1520639c06f1 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -47,6 +47,71 @@ /* See Linux kernel arch/arm64/include/asm/pvclock-abi.h */ #define PVTIME_SIZE_PER_CPU 64 +/* ARM CLIDR_EL1 related definitions 
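+ * Editor's note: the CLIDR_CTYPE() helper below extracts the 3-bit cache
+ * type field for a given level, e.g. CLIDR_CTYPE(clidr, 2) yields
+ * CTYPE_UNIFIED (0b100) when the L2 cache is unified.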
*/ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CTYPE_NONE 0b000 +#define CTYPE_INS 0b001 +#define CTYPE_DATA 0b010 +#define CTYPE_INS_DATA 0b011 +#define CTYPE_UNIFIED 0b100 + +#define ARM64_REG_CLIDR_EL1 ARM64_SYS_REG(3, 1, 0, 0, 1) + +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +/* L1 data cache */ +#define ARM_L1DCACHE_SIZE 65536 +#define ARM_L1DCACHE_SETS 256 +#define ARM_L1DCACHE_ASSOCIATIVITY 4 +#define ARM_L1DCACHE_ATTRIBUTES 2 +#define ARM_L1DCACHE_LINE_SIZE 64 + +/* L1 instruction cache */ +#define ARM_L1ICACHE_SIZE 65536 +#define ARM_L1ICACHE_SETS 256 +#define ARM_L1ICACHE_ASSOCIATIVITY 4 +#define ARM_L1ICACHE_ATTRIBUTES 4 +#define ARM_L1ICACHE_LINE_SIZE 64 + +/* L1 unified cache */ +#define ARM_L1CACHE_SIZE 131072 +#define ARM_L1CACHE_SETS 256 +#define ARM_L1CACHE_ASSOCIATIVITY 4 +#define ARM_L1CACHE_ATTRIBUTES 10 +#define ARM_L1CACHE_LINE_SIZE 128 + +/* L2 unified cache */ +#define ARM_L2CACHE_SIZE 524288 +#define ARM_L2CACHE_SETS 1024 +#define ARM_L2CACHE_ASSOCIATIVITY 8 +#define ARM_L2CACHE_ATTRIBUTES 10 +#define ARM_L2CACHE_LINE_SIZE 64 + +/* L3 unified cache */ +#define ARM_L3CACHE_SIZE 33554432 +#define ARM_L3CACHE_SETS 2048 +#define ARM_L3CACHE_ASSOCIATIVITY 15 +#define ARM_L3CACHE_ATTRIBUTES 10 +#define ARM_L3CACHE_LINE_SIZE 128 + +/* Definitions of the hardcoded cache info */ +typedef enum { + ARM_L1D_CACHE, + ARM_L1I_CACHE, + ARM_L1_CACHE, + ARM_L2_CACHE, + ARM_L3_CACHE +} ArmCacheType; + +/* MMIO region size for SMMUv3 */ +#define SMMU_IO_LEN 0x20000 + +/* Max supported nested SMMUv3 */ +#define MAX_SMMU_ACCEL 64 + enum { VIRT_FLASH, VIRT_MEM, @@ -59,12 +124,16 @@ enum { VIRT_GIC_ITS, VIRT_GIC_REDIST, VIRT_SMMU, + VIRT_SMMU_ACCEL, VIRT_UART, + VIRT_CPUFREQ, VIRT_MMIO, + VIRT_CVM_MSI, VIRT_RTC, VIRT_FW_CFG, VIRT_PCIE, VIRT_PCIE_MMIO, + VIRT_KAE_DEVICE, VIRT_PCIE_PIO, VIRT_PCIE_ECAM, VIRT_PLATFORM_BUS, @@ -75,6 +144,7 @@ enum { VIRT_PCDIMM_ACPI, VIRT_ACPI_GED, VIRT_NVDIMM_ACPI, + VIRT_CPUHP_ACPI, VIRT_PVTIME, VIRT_LOWMEMMAP_LAST, }; @@ -89,6 +159,7 @@ enum { typedef enum VirtIOMMUType { VIRT_IOMMU_NONE, VIRT_IOMMU_SMMUV3, + VIRT_IOMMU_SMMUV3_ACCEL, VIRT_IOMMU_VIRTIO, } VirtIOMMUType; @@ -138,6 +209,10 @@ struct VirtMachineState { DeviceState *platform_bus_dev; FWCfgState *fw_cfg; PFlashCFI01 *flash[2]; + MemoryRegion *sysmem; + MemoryRegion *secure_sysmem; + MemoryRegion *tag_sysmem; + MemoryRegion *secure_tag_sysmem; bool secure; bool highmem; bool highmem_compact; @@ -147,9 +222,12 @@ struct VirtMachineState { bool its; bool tcg_its; bool virt; + bool cpu_hotplug_enabled; bool ras; bool mte; bool dtb_randomness; + bool pmu; + int smmu_accel_count; OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; @@ -160,6 +238,7 @@ struct VirtMachineState { MemMapEntry *memmap; char *pciehb_nodename; const int *irqmap; + uint16_t boot_cpus; int fdt_size; uint32_t clock_phandle; uint32_t gic_phandle; @@ -173,6 +252,8 @@ struct VirtMachineState { PCIBus *bus; char *oem_id; char *oem_table_id; + char *kvm_type; + NotifierList cpuhp_notifiers; }; #define VIRT_ECAM_ID(high) (high ? 
VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM) @@ -182,6 +263,7 @@ OBJECT_DECLARE_TYPE(VirtMachineState, VirtMachineClass, VIRT_MACHINE) void virt_acpi_setup(VirtMachineState *vms); bool virt_is_acpi_enabled(VirtMachineState *vms); +bool cpu_l1_cache_unified(int cpu); /* Return number of redistributors that fit in the specified region */ static uint32_t virt_redist_capacity(VirtMachineState *vms, int region) @@ -200,11 +282,23 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms) { uint32_t redist0_capacity = virt_redist_capacity(vms, VIRT_GIC_REDIST); + MachineState *ms = MACHINE(vms); + unsigned int max_cpus = ms->smp.max_cpus; + + if (!vms->cpu_hotplug_enabled) { + max_cpus = ms->smp.cpus; + } assert(vms->gic_version != VIRT_GIC_VERSION_2); - return (MACHINE(vms)->smp.cpus > redist0_capacity && + return (max_cpus > redist0_capacity && vms->highmem_redists) ? 2 : 1; } +static inline bool virt_has_smmuv3(const VirtMachineState *vms) +{ + return vms->iommu == VIRT_IOMMU_SMMUV3 || + vms->iommu == VIRT_IOMMU_SMMUV3_ACCEL; +} + #endif /* QEMU_ARM_VIRT_H */ diff --git a/include/hw/block/block.h b/include/hw/block/block.h index 15fff6643502e51a0896189d12709e1622c2b339..844e87495ad81407f9bf29f78c8612ab3626784a 100644 --- a/include/hw/block/block.h +++ b/include/hw/block/block.h @@ -34,6 +34,8 @@ typedef struct BlockConf { OnOffAuto account_invalid, account_failed; BlockdevOnError rerror; BlockdevOnError werror; + int64_t retry_interval; + int64_t retry_timeout; } BlockConf; static inline unsigned int get_physical_block_exp(BlockConf *conf) @@ -84,7 +86,11 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf) DEFINE_PROP_BLOCKDEV_ON_ERROR("rerror", _state, _conf.rerror, \ BLOCKDEV_ON_ERROR_AUTO), \ DEFINE_PROP_BLOCKDEV_ON_ERROR("werror", _state, _conf.werror, \ - BLOCKDEV_ON_ERROR_AUTO) + BLOCKDEV_ON_ERROR_AUTO), \ + DEFINE_PROP_BLOCKDEV_RETRY_INTERVAL("retry_interval", _state, \ + _conf.retry_interval, 1000), \ + DEFINE_PROP_BLOCKDEV_RETRY_TIMEOUT("retry_timeout", _state, \ + _conf.retry_timeout, 0) /* Backend access helpers */ diff --git a/include/hw/boards.h b/include/hw/boards.h index da85f86efb9185df05a444bd22eeec0557a3e3c8..8ac8cad2a2225e97df1475c2f20118ccf2af394e 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -389,6 +389,8 @@ struct MachineState { ram_addr_t ram_size; ram_addr_t maxram_size; + ram_addr_t ram2_base; + ram_addr_t ram2_size; uint64_t ram_slots; BootConfiguration boot_config; char *kernel_filename; diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index c0c8320413e5f66f3c026ec6b4b89374b9cde7b3..37f3a469c897e09f73fc23a1fa999bee7f834238 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -495,7 +495,8 @@ struct CPUState { QemuMutex work_mutex; QSIMPLEQ_HEAD(, qemu_work_item) work_list; - CPUAddressSpace *cpu_ases; + struct CPUAddressSpace *cpu_ases; + int cpu_ases_count; int num_ases; AddressSpace *as; MemoryRegion *memory; @@ -528,6 +529,7 @@ struct CPUState { uint32_t kvm_fetch_index; uint64_t dirty_pages; int kvm_vcpu_stats_fd; + VMChangeStateEntry *vmcse; /* Use by accel-block: CPU is executing an ioctl() */ QemuLockCnt in_ioctl_lock; @@ -538,6 +540,24 @@ struct CPUState { GArray *plugin_mem_cbs; #endif + /* + * Some architectures do not allow the *presence* of vCPUs to be changed + * after the guest has booted, because presence is conveyed to the guest + * by the VMM/firmware via the ACPI MADT at boot time.
Thus, to enable vCPU hotplug on these + * architectures, a possible vCPU can have its CPUState object in a + * 'disabled' state, or can have no CPUState object at all. This is + * possible when vCPU hotplug is supported and vCPUs are + * 'yet-to-be-plugged' in the QOM or have been hot-unplugged. + * By default, every CPUState is currently enabled across all + * architectures. + */ + bool disabled; + /* + * On certain architectures, to give the guest a persistent view of vCPU + * 'presence', ACPI might need to fake the 'presence' of the vCPUs while + * keeping them ACPI-disabled for the guest. This is done by returning + * _STA.PRES=True and _STA.Ena=False for the unplugged vCPUs in QEMU QoM. + */ + bool acpi_persistent; /* TODO Move common fields from CPUArchState here. */ int cpu_index; int cluster_index; @@ -545,6 +565,8 @@ struct CPUState { uint32_t halted; int32_t exception_index; + bool cold_booted; + AccelCPUState *accel; /* shared by kvm and hvf */ bool vcpu_dirty; @@ -913,6 +935,61 @@ static inline bool cpu_in_exclusive_context(const CPUState *cpu) */ CPUState *qemu_get_cpu(int index); +/** + * qemu_get_possible_cpu: + * @index: The CPUState@cpu_index value of the CPU to obtain. + * The input index MUST be in the range [0, Max Possible CPUs) + * + * If a CPUState object exists, then it gets the CPU matching + * @index in the possible CPU array. + * + * Returns: The possible CPU or %NULL if the CPU does not exist. + */ +CPUState *qemu_get_possible_cpu(int index); + +/** + * qemu_present_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU is amongst the present possible vCPUs. + * + * Returns: True if it is a present possible vCPU, else false + */ +bool qemu_present_cpu(CPUState *cpu); + +/** + * qemu_enabled_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU is enabled. + * + * Returns: True if it is 'enabled', else false + */ +bool qemu_enabled_cpu(CPUState *cpu); + +/** + * qemu_persistent_cpu: + * @cpu: The vCPU to check + * + * Checks if the vCPU state should always be reflected as *present* via ACPI + * to the Guest. By default, this is False on all architectures and has to be + * explicitly set during initialization. + * + * Returns: True if it is an ACPI 'persistent' CPU + */ +bool qemu_persistent_cpu(CPUState *cpu); + +/** + * qemu_get_cpu_archid: + * @cpu_index: possible vCPU for which the arch-id needs to be retrieved + * + * Fetches the vCPU arch-id from the present possible vCPUs. + * + * Returns: arch-id of the possible vCPU + */ +uint64_t qemu_get_cpu_archid(int cpu_index); + /** * cpu_exists: * @id: Guest-exposed CPU ID to lookup.
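/*
 * Editor's illustration (not part of the patch): one plausible way for board
 * code to walk the possible-vCPU space with the helpers declared above. The
 * function name and the use of ms->smp.max_cpus as the bound are assumptions
 * made for this sketch; qemu_get_possible_cpu() may return NULL for vCPUs
 * that are yet-to-be-plugged or have been hot-unplugged.
 */
static int count_enabled_vcpus(MachineState *ms)
{
    int enabled = 0;

    for (int i = 0; i < ms->smp.max_cpus; i++) {
        CPUState *cs = qemu_get_possible_cpu(i);

        if (cs && qemu_enabled_cpu(cs)) {
            enabled++; /* present and ACPI-enabled */
        }
    }
    return enabled;
}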
diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h index 0a5c258fe6898c58b5b7aa03dee4286e70c20541..9c35d1b9da6c36198b6f49e99db6eb49cdfae058 100644 --- a/include/hw/elf_ops.h +++ b/include/hw/elf_ops.h @@ -500,7 +500,7 @@ static ssize_t glue(load_elf, SZ)(const char *name, int fd, } if (data_swab) { - int j; + elf_word j; for (j = 0; j < file_size; j += (1 << data_swab)) { uint8_t *dp = data + j; switch (data_swab) { diff --git a/include/hw/firmware/smbios.h b/include/hw/firmware/smbios.h index 7f3259a6300af0d7b8a359b879f5dcb99cf4d485..6e514982d49c40fef516fe1220aaed7f31062435 100644 --- a/include/hw/firmware/smbios.h +++ b/include/hw/firmware/smbios.h @@ -295,6 +295,7 @@ void smbios_set_cpuid(uint32_t version, uint32_t features); void smbios_set_defaults(const char *manufacturer, const char *product, const char *version, bool legacy_mode, bool uuid_encoded, SmbiosEntryPointType ep_type); +void smbios_set_default_processor_family(uint16_t processor_family); uint8_t *smbios_get_table_legacy(MachineState *ms, size_t *length); void smbios_get_tables(MachineState *ms, const struct smbios_phys_mem_area *mem_array, diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 7fa0a695c87bb8569fd6985e299fc0a1cc4b0c0c..1eb05c29fc9c703a61f06d90616694e74fb61c15 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -292,6 +292,8 @@ struct IntelIOMMUState { /* list of registered notifiers */ QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; + GHashTable *vtd_host_iommu_dev; /* HostIOMMUDevice */ + /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ dma_addr_t intr_root; /* Interrupt remapping table pointer */ diff --git a/include/hw/intc/arm_gicv3_common.h b/include/hw/intc/arm_gicv3_common.h index 4e2fb518e7240074465b18aab9ea96d5752146a9..b5f8ba17ff30c80b70eaa1e2e31060970f437b9b 100644 --- a/include/hw/intc/arm_gicv3_common.h +++ b/include/hw/intc/arm_gicv3_common.h @@ -280,6 +280,7 @@ struct GICv3State { GICv3CPUState *gicd_irouter_target[GICV3_MAXIRQ]; uint32_t gicd_nsacr[DIV_ROUND_UP(GICV3_MAXIRQ, 16)]; + Notifier cpu_update_notifier; GICv3CPUState *cpu; /* List of all ITSes connected to this GIC */ GPtrArray *itslist; @@ -324,10 +325,32 @@ struct ARMGICv3CommonClass { void (*pre_save)(GICv3State *s); void (*post_load)(GICv3State *s); + void (*init_cpu_reginfo)(CPUState *cs); }; void gicv3_init_irqs_and_mmio(GICv3State *s, qemu_irq_handler handler, const MemoryRegionOps *ops); +/** + * Structure used by the GICv3 CPU hotplug notifier + */ +typedef struct GICv3CPUHotplugInfo { + DeviceState *gic; /* GICv3State */ + CPUState *cpu; +} GICv3CPUHotplugInfo; + +/** + * gicv3_cpuhp_notifier: + * @dev: the GICv3 #DeviceState + * + * Returns the CPU hotplug notifier, which can be used to update the GIC + * about any CPU hot(un)plug events.
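+ *
+ * Editor's sketch: a board would typically chain this onto its own hotplug
+ * notifier list, e.g. (vms and gicdev being hypothetical names):
+ *   notifier_list_add(&vms->cpuhp_notifiers, gicv3_cpuhp_notifier(gicdev));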
+ * + * Returns: Notifier initialized with CPU Hot(un)plug update function + */ +static inline Notifier *gicv3_cpuhp_notifier(DeviceState *dev) +{ + GICv3State *s = ARM_GICV3_COMMON(dev); + return &s->cpu_update_notifier; +} /** * gicv3_class_name diff --git a/include/hw/intc/loongarch_extioi.h b/include/hw/intc/loongarch_extioi.h index fbdef9a7b3b74ec70cd12878b5a2680d90d4a651..92b38d5c38bea88f7b7c107199b910201d72281d 100644 --- a/include/hw/intc/loongarch_extioi.h +++ b/include/hw/intc/loongarch_extioi.h @@ -15,7 +15,7 @@ #define EXTIOI_IRQS (256) #define EXTIOI_IRQS_BITMAP_SIZE (256 / 8) /* irq from EXTIOI is routed to no more than 4 cpus */ -#define EXTIOI_CPUS (4) +#define EXTIOI_CPUS (256) /* map to ipnum per 32 irqs */ #define EXTIOI_IRQS_IPMAP_SIZE (256 / 32) #define EXTIOI_IRQS_COREMAP_SIZE 256 @@ -39,26 +39,88 @@ #define EXTIOI_COREISR_END (0xB20 - APIC_OFFSET) #define EXTIOI_COREMAP_START (0xC00 - APIC_OFFSET) #define EXTIOI_COREMAP_END (0xD00 - APIC_OFFSET) +#define EXTIOI_SIZE 0x800 + +#define EXTIOI_VIRT_BASE (0x40000000) +#define EXTIOI_VIRT_SIZE (0x1000) +#define EXTIOI_VIRT_FEATURES (0x0) +#define EXTIOI_HAS_VIRT_EXTENSION (0) +#define EXTIOI_HAS_ENABLE_OPTION (1) +#define EXTIOI_HAS_INT_ENCODE (2) +#define EXTIOI_HAS_CPU_ENCODE (3) +#define EXTIOI_VIRT_HAS_FEATURES (BIT(EXTIOI_HAS_VIRT_EXTENSION) \ + | BIT(EXTIOI_HAS_ENABLE_OPTION)\ + | BIT(EXTIOI_HAS_INT_ENCODE) \ + | BIT(EXTIOI_HAS_CPU_ENCODE)) +#define EXTIOI_VIRT_CONFIG (0x4) +#define EXTIOI_ENABLE (1) +#define EXTIOI_ENABLE_INT_ENCODE (2) +#define EXTIOI_ENABLE_CPU_ENCODE (3) +#define EXTIOI_VIRT_COREMAP_START (0x40) +#define EXTIOI_VIRT_COREMAP_END (0x240) + +#define EXTIOI_SW_COREMAP_FLAG (1 << 0) + +typedef struct ExtIOICore { + uint32_t coreisr[EXTIOI_IRQS_GROUP_COUNT]; + DECLARE_BITMAP(sw_isr[LS3A_INTC_IP], EXTIOI_IRQS); + qemu_irq parent_irq[LS3A_INTC_IP]; +} ExtIOICore; + +#define TYPE_LOONGARCH_EXTIOI "loongarch-extioi" +#define TYPE_KVM_LOONGARCH_EXTIOI "loongarch-kvm-extioi" -#define TYPE_LOONGARCH_EXTIOI "loongarch.extioi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchExtIOI, LOONGARCH_EXTIOI) struct LoongArchExtIOI { SysBusDevice parent_obj; + uint32_t num_cpu; + uint32_t features; + uint32_t status; /* hardware state */ uint32_t nodetype[EXTIOI_IRQS_NODETYPE_COUNT / 2]; uint32_t bounce[EXTIOI_IRQS_GROUP_COUNT]; uint32_t isr[EXTIOI_IRQS / 32]; - uint32_t coreisr[EXTIOI_CPUS][EXTIOI_IRQS_GROUP_COUNT]; uint32_t enable[EXTIOI_IRQS / 32]; uint32_t ipmap[EXTIOI_IRQS_IPMAP_SIZE / 4]; uint32_t coremap[EXTIOI_IRQS / 4]; uint32_t sw_pending[EXTIOI_IRQS / 32]; - DECLARE_BITMAP(sw_isr[EXTIOI_CPUS][LS3A_INTC_IP], EXTIOI_IRQS); uint8_t sw_ipmap[EXTIOI_IRQS_IPMAP_SIZE]; uint8_t sw_coremap[EXTIOI_IRQS]; - qemu_irq parent_irq[EXTIOI_CPUS][LS3A_INTC_IP]; qemu_irq irq[EXTIOI_IRQS]; - MemoryRegion extioi_iocsr_mem[EXTIOI_CPUS]; + ExtIOICore *cpu; MemoryRegion extioi_system_mem; + MemoryRegion virt_extend; +}; + +struct KVMLoongArchExtIOI { + SysBusDevice parent_obj; + uint32_t num_cpu; + uint32_t features; + uint32_t status; + + /* hardware state */ + uint32_t nodetype[EXTIOI_IRQS_NODETYPE_COUNT / 2]; + uint32_t bounce[EXTIOI_IRQS_GROUP_COUNT]; + uint32_t isr[EXTIOI_IRQS / 32]; + uint32_t coreisr[EXTIOI_CPUS][EXTIOI_IRQS_GROUP_COUNT]; + uint32_t enable[EXTIOI_IRQS / 32]; + uint32_t ipmap[EXTIOI_IRQS_IPMAP_SIZE / 4]; + uint32_t coremap[EXTIOI_IRQS / 4]; + uint8_t sw_coremap[EXTIOI_IRQS]; }; +typedef struct KVMLoongArchExtIOI KVMLoongArchExtIOI; +DECLARE_INSTANCE_CHECKER(KVMLoongArchExtIOI, KVM_LOONGARCH_EXTIOI, + 
TYPE_KVM_LOONGARCH_EXTIOI) + +struct KVMLoongArchExtIOIClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool is_created; + int dev_fd; +}; +typedef struct KVMLoongArchExtIOIClass KVMLoongArchExtIOIClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchExtIOIClass, KVM_LOONGARCH_EXTIOI, + TYPE_KVM_LOONGARCH_EXTIOI) + #endif /* LOONGARCH_EXTIOI_H */ diff --git a/include/hw/intc/loongarch_ipi.h b/include/hw/intc/loongarch_ipi.h index 6c6194786e807317a3aa800e94e36ee1bef4652d..601b4f18a7b835457ad12578d836c001ca23797b 100644 --- a/include/hw/intc/loongarch_ipi.h +++ b/include/hw/intc/loongarch_ipi.h @@ -32,6 +32,7 @@ #define TYPE_LOONGARCH_IPI "loongarch_ipi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchIPI, LOONGARCH_IPI) +#define TYPE_KVM_LOONGARCH_IPI "loongarch-ipi-kvm" typedef struct IPICore { uint32_t status; @@ -47,7 +48,30 @@ struct LoongArchIPI { SysBusDevice parent_obj; MemoryRegion ipi_iocsr_mem; MemoryRegion ipi64_iocsr_mem; - IPICore ipi_core; + uint32_t num_cpu; + IPICore *cpu; }; +struct KVMLoongArchIPI { + SysBusDevice parent_obj; + uint32_t num_cpu; + IPICore *cpu; +}; +typedef struct KVMLoongArchIPI KVMLoongArchIPI; +DECLARE_INSTANCE_CHECKER(KVMLoongArchIPI, KVM_LOONGARCH_IPI, + TYPE_KVM_LOONGARCH_IPI) + +struct KVMLoongArchIPIClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool is_created; + int dev_fd; + +}; +typedef struct KVMLoongArchIPIClass KVMLoongArchIPIClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchIPIClass, KVM_LOONGARCH_IPI, + TYPE_KVM_LOONGARCH_IPI) + + #endif diff --git a/include/hw/intc/loongarch_pch_msi.h b/include/hw/intc/loongarch_pch_msi.h index b8586fb3b6f6b170915d4b126ad4d562dec1890a..fd4ea97a83b3e7032efec0126179e5f0eb73ba2a 100644 --- a/include/hw/intc/loongarch_pch_msi.h +++ b/include/hw/intc/loongarch_pch_msi.h @@ -7,7 +7,7 @@ #include "hw/sysbus.h" -#define TYPE_LOONGARCH_PCH_MSI "loongarch_pch_msi" +#define TYPE_LOONGARCH_PCH_MSI "loongarch_pch_msi" OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHMSI, LOONGARCH_PCH_MSI) /* MSI irq start from 32 to 255 */ diff --git a/include/hw/intc/loongarch_pch_pic.h b/include/hw/intc/loongarch_pch_pic.h index d5437e88f289880b1ab4d93f143e32eb4608c743..77f4cd74a129755a3f580b742655a1f6e915f26e 100644 --- a/include/hw/intc/loongarch_pch_pic.h +++ b/include/hw/intc/loongarch_pch_pic.h @@ -7,7 +7,8 @@ #include "hw/sysbus.h" -#define TYPE_LOONGARCH_PCH_PIC "loongarch_pch_pic" +#define TYPE_LOONGARCH_PCH_PIC "loongarch_pch_pic" +#define TYPE_KVM_LOONGARCH_PCH_PIC "loongarch_kvm_pch_pic" #define PCH_PIC_NAME(name) TYPE_LOONGARCH_PCH_PIC#name OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHPIC, LOONGARCH_PCH_PIC) @@ -37,6 +38,19 @@ OBJECT_DECLARE_SIMPLE_TYPE(LoongArchPCHPIC, LOONGARCH_PCH_PIC) #define PCH_PIC_INT_POL_LO 0x3e0 #define PCH_PIC_INT_POL_HI 0x3e4 +#define PCH_PIC_INT_ID_START PCH_PIC_INT_ID_LO +#define PCH_PIC_MASK_START PCH_PIC_INT_MASK_LO +#define PCH_PIC_HTMSI_EN_START PCH_PIC_HTMSI_EN_LO +#define PCH_PIC_EDGE_START PCH_PIC_INT_EDGE_LO +#define PCH_PIC_CLEAR_START PCH_PIC_INT_CLEAR_LO +#define PCH_PIC_AUTO_CTRL0_START PCH_PIC_AUTO_CTRL0_LO +#define PCH_PIC_AUTO_CTRL1_START PCH_PIC_AUTO_CTRL1_LO +#define PCH_PIC_ROUTE_ENTRY_START PCH_PIC_ROUTE_ENTRY_OFFSET +#define PCH_PIC_HTMSI_VEC_START PCH_PIC_HTMSI_VEC_OFFSET +#define PCH_PIC_INT_IRR_START 0x380 +#define PCH_PIC_INT_ISR_START PCH_PIC_INT_STATUS_LO +#define PCH_PIC_POLARITY_START PCH_PIC_INT_POL_LO + #define STATUS_LO_START 0 #define STATUS_HI_START 0x4 #define POL_LO_START 0x40 @@ -67,3 +81,38 @@ struct LoongArchPCHPIC { MemoryRegion iomem8; 
unsigned int irq_num; }; + +struct KVMLoongArchPCHPIC { + SysBusDevice parent_obj; + uint64_t int_mask; /* 0x020 interrupt mask register */ + uint64_t htmsi_en; /* 0x040 1 = MSI */ + uint64_t intedge; /* 0x060 edge = 1, level = 0 */ + uint64_t intclr; /* 0x080 clears edge interrupts: writing 1 clears, writing 0 is ignored */ + uint64_t auto_crtl0; /* 0x0c0 */ + uint64_t auto_crtl1; /* 0x0e0 */ + uint64_t last_intirr; /* edge detection */ + uint64_t intirr; /* 0x380 interrupt request register */ + uint64_t intisr; /* 0x3a0 interrupt service register */ + /* + * 0x3e0 interrupt level polarity selection + * register; 0 for high-level trigger + */ + uint64_t int_polarity; + + uint8_t route_entry[64]; /* 0x100 - 0x138 */ + uint8_t htmsi_vector[64]; /* 0x200 - 0x238 */ +}; +typedef struct KVMLoongArchPCHPIC KVMLoongArchPCHPIC; +DECLARE_INSTANCE_CHECKER(KVMLoongArchPCHPIC, KVM_LOONGARCH_PCH_PIC, + TYPE_KVM_LOONGARCH_PCH_PIC) + +struct KVMLoongArchPCHPICClass { + SysBusDeviceClass parent_class; + DeviceRealize parent_realize; + + bool is_created; + int dev_fd; +}; +typedef struct KVMLoongArchPCHPICClass KVMLoongArchPCHPICClass; +DECLARE_CLASS_CHECKERS(KVMLoongArchPCHPICClass, KVM_LOONGARCH_PCH_PIC, + TYPE_KVM_LOONGARCH_PCH_PIC) diff --git a/include/hw/loongarch/boot.h b/include/hw/loongarch/boot.h new file mode 100644 index 0000000000000000000000000000000000000000..b3b870df1f097662dff7fc7c173f152da583d374 --- /dev/null +++ b/include/hw/loongarch/boot.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Definitions for LoongArch boot. + * + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +#ifndef HW_LOONGARCH_BOOT_H +#define HW_LOONGARCH_BOOT_H + +/* UEFI 2.10 */ +#define EFI_SYSTEM_TABLE_SIGNATURE 0x5453595320494249 +#define EFI_2_100_SYSTEM_TABLE_REVISION ((2<<16) | (100)) +#define EFI_SPECIFICATION_VERSION EFI_SYSTEM_TABLE_REVISION +#define EFI_SYSTEM_TABLE_REVISION EFI_2_100_SYSTEM_TABLE_REVISION + +#define FW_VERSION 0x1 +#define FW_PATCHLEVEL 0x0 + +typedef struct { + uint8_t b[16]; +} efi_guid_t QEMU_ALIGNED(8); + +#define EFI_GUID(a, b, c, d...)
(efi_guid_t){ { \ + (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, d } } + +#define LINUX_EFI_BOOT_MEMMAP_GUID \ + EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, \ + 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) + +#define LINUX_EFI_INITRD_MEDIA_GUID \ + EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, \ + 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) + +#define DEVICE_TREE_GUID \ + EFI_GUID(0xb1b621d5, 0xf19c, 0x41a5, 0x83, 0x0b, \ + 0xd9, 0x15, 0x2c, 0x69, 0xaa, 0xe0) + +struct efi_config_table { + efi_guid_t guid; + uint64_t *ptr; + const char name[16]; +}; + +typedef struct { + uint64_t signature; + uint32_t revision; + uint32_t headersize; + uint32_t crc32; + uint32_t reserved; +} efi_table_hdr_t; + +struct efi_configuration_table { + efi_guid_t guid; + void *table; +}; + +struct efi_system_table { + efi_table_hdr_t hdr; + uint64_t fw_vendor; /* physical addr of CHAR16 vendor string */ + uint32_t fw_revision; + uint64_t con_in_handle; + uint64_t *con_in; + uint64_t con_out_handle; + uint64_t *con_out; + uint64_t stderr_handle; + uint64_t stderr_placeholder; + uint64_t *runtime; + uint64_t *boottime; + uint64_t nr_tables; + struct efi_configuration_table *tables; +}; + +typedef struct { + uint32_t type; + uint32_t pad; + uint64_t phys_addr; + uint64_t virt_addr; + uint64_t num_pages; + uint64_t attribute; +} efi_memory_desc_t; + +struct efi_boot_memmap { + uint64_t map_size; + uint64_t desc_size; + uint32_t desc_ver; + uint64_t map_key; + uint64_t buff_size; + efi_memory_desc_t map[32]; +}; + +struct efi_initrd { + uint64_t base; + uint64_t size; +}; + +struct loongarch_boot_info { + uint64_t ram_size; + const char *kernel_filename; + const char *kernel_cmdline; + const char *initrd_filename; + uint64_t a0, a1, a2; +}; + +extern struct memmap_entry *memmap_table; +extern unsigned memmap_entries; + +struct memmap_entry { + uint64_t address; + uint64_t length; + uint32_t type; + uint32_t reserved; +}; + +void loongarch_load_kernel(MachineState *ms, struct loongarch_boot_info *info); + +#endif /* HW_LOONGARCH_BOOT_H */ diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h index 674f4655e0ee974c215a23234cff2e67fa2d03e1..0edce84d6db7dca7f004f57a475ca35216dea46a 100644 --- a/include/hw/loongarch/virt.h +++ b/include/hw/loongarch/virt.h @@ -8,29 +8,51 @@ #ifndef HW_LOONGARCH_H #define HW_LOONGARCH_H -#include "target/loongarch/cpu.h" #include "hw/boards.h" #include "qemu/queue.h" #include "hw/intc/loongarch_ipi.h" #include "hw/block/flash.h" +#include "hw/loongarch/boot.h" #define LOONGARCH_MAX_CPUS 256 #define VIRT_FWCFG_BASE 0x1e020000UL #define VIRT_BIOS_BASE 0x1c000000UL -#define VIRT_BIOS_SIZE (4 * MiB) +#define VIRT_BIOS_SIZE (16 * MiB) #define VIRT_FLASH_SECTOR_SIZE (128 * KiB) -#define VIRT_FLASH_BASE 0x1d000000UL -#define VIRT_FLASH_SIZE (16 * MiB) +#define VIRT_FLASH0_BASE VIRT_BIOS_BASE +#define VIRT_FLASH0_SIZE VIRT_BIOS_SIZE +#define VIRT_FLASH1_BASE 0x1d000000UL +#define VIRT_FLASH1_SIZE (16 * MiB) #define VIRT_LOWMEM_BASE 0 #define VIRT_LOWMEM_SIZE 0x10000000 -#define VIRT_HIGHMEM_BASE 0x90000000 +#define VIRT_HIGHMEM_BASE 0x80000000 #define VIRT_GED_EVT_ADDR 0x100e0000 #define VIRT_GED_MEM_ADDR (VIRT_GED_EVT_ADDR + ACPI_GED_EVT_SEL_LEN) #define VIRT_GED_REG_ADDR (VIRT_GED_MEM_ADDR + MEMORY_HOTPLUG_IO_LEN) +#define VIRT_GED_CPUHP_ADDR (VIRT_GED_REG_ADDR + ACPI_GED_REG_COUNT) -struct LoongArchMachineState { +#define COMMAND_LINE_SIZE 512 + +#define FDT_BASE 0x100000 + +/* KVM_IRQ_LINE 
irq field index values */ +#define KVM_LOONGARCH_IRQ_TYPE_SHIFT 24 +#define KVM_LOONGARCH_IRQ_TYPE_MASK 0xff +#define KVM_LOONGARCH_IRQ_VCPU_SHIFT 16 +#define KVM_LOONGARCH_IRQ_VCPU_MASK 0xff +#define KVM_LOONGARCH_IRQ_NUM_SHIFT 0 +#define KVM_LOONGARCH_IRQ_NUM_MASK 0xffff + +/* irq_type field */ +#define KVM_LOONGARCH_IRQ_TYPE_CPU_IP 0 +#define KVM_LOONGARCH_IRQ_TYPE_CPU_IO 1 +#define KVM_LOONGARCH_IRQ_TYPE_HT 2 +#define KVM_LOONGARCH_IRQ_TYPE_MSI 3 +#define KVM_LOONGARCH_IRQ_TYPE_IOAPIC 4 + +struct LoongArchVirtMachineState { /*< private >*/ MachineState parent_obj; @@ -43,17 +65,25 @@ struct LoongArchMachineState { Notifier machine_done; Notifier powerdown_notifier; OnOffAuto acpi; + OnOffAuto veiointc; char *oem_id; char *oem_table_id; DeviceState *acpi_ged; int fdt_size; DeviceState *platform_bus_dev; + DeviceState *extioi; PCIBus *pci_bus; - PFlashCFI01 *flash; + PFlashCFI01 *flash[2]; + MemoryRegion system_iocsr; + MemoryRegion iocsr_mem; + AddressSpace as_iocsr; + int features; + struct loongarch_boot_info bootinfo; + DeviceState *ipi; }; -#define TYPE_LOONGARCH_MACHINE MACHINE_TYPE_NAME("virt") -OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_MACHINE) -bool loongarch_is_acpi_enabled(LoongArchMachineState *lams); -void loongarch_acpi_setup(LoongArchMachineState *lams); +#define TYPE_LOONGARCH_VIRT_MACHINE MACHINE_TYPE_NAME("virt") +OBJECT_DECLARE_SIMPLE_TYPE(LoongArchVirtMachineState, LOONGARCH_VIRT_MACHINE) +void loongarch_acpi_setup(LoongArchVirtMachineState *lvms); +void reset_load_elf(void *opaque); #endif diff --git a/include/hw/misc/mos6522.h b/include/hw/misc/mos6522.h index fba45668abae32b06b82deb4df404b3f046d81d3..920871a598bc68254f6116e15424cd23b5ebcd0d 100644 --- a/include/hw/misc/mos6522.h +++ b/include/hw/misc/mos6522.h @@ -154,7 +154,7 @@ struct MOS6522State { OBJECT_DECLARE_TYPE(MOS6522State, MOS6522DeviceClass, MOS6522) struct MOS6522DeviceClass { - DeviceClass parent_class; + SysBusDeviceClass parent_class; ResettablePhases parent_phases; void (*portB_write)(MOS6522State *dev); diff --git a/include/hw/pci-host/gpex.h b/include/hw/pci-host/gpex.h index b0240bd7681fb36db4e6b64ed056855e28c363de..65475f7f9d128640b876c317ef7daff9287168e7 100644 --- a/include/hw/pci-host/gpex.h +++ b/include/hw/pci-host/gpex.h @@ -64,6 +64,7 @@ struct GPEXConfig { MemMapEntry pio; int irq; PCIBus *bus; + bool preserve_config; }; int gpex_set_irq_num(GPEXHost *s, int index, int gsi); diff --git a/include/hw/pci-host/ls7a.h b/include/hw/pci-host/ls7a.h index e753449593bf035d0b82cd99e6a0d338556aba45..79d4ea85012747bf27449b7d02c29d9a577d49c9 100644 --- a/include/hw/pci-host/ls7a.h +++ b/include/hw/pci-host/ls7a.h @@ -24,6 +24,8 @@ #define VIRT_PCH_REG_BASE 0x10000000UL #define VIRT_IOAPIC_REG_BASE (VIRT_PCH_REG_BASE) #define VIRT_PCH_MSI_ADDR_LOW 0x2FF00000UL +#define VIRT_PCH_REG_SIZE 0x400 +#define VIRT_PCH_MSI_SIZE 0x8 /* * GSI_BASE is hard-coded with 64 in linux kernel, else kernel fails to boot @@ -34,17 +36,18 @@ #define VIRT_PCH_PIC_IRQ_NUM 32 #define VIRT_GSI_BASE 64 #define VIRT_DEVICE_IRQS 16 +#define VIRT_UART_COUNT 4 #define VIRT_UART_IRQ (VIRT_GSI_BASE + 2) #define VIRT_UART_BASE 0x1fe001e0 -#define VIRT_UART_SIZE 0X100 -#define VIRT_RTC_IRQ (VIRT_GSI_BASE + 3) +#define VIRT_UART_SIZE 0x100 +#define VIRT_RTC_IRQ (VIRT_GSI_BASE + 6) #define VIRT_MISC_REG_BASE (VIRT_PCH_REG_BASE + 0x00080000) #define VIRT_RTC_REG_BASE (VIRT_MISC_REG_BASE + 0x00050100) #define VIRT_RTC_LEN 0x100 -#define VIRT_SCI_IRQ (VIRT_GSI_BASE + 4) +#define VIRT_SCI_IRQ (VIRT_GSI_BASE + 7) #define 
VIRT_PLATFORM_BUS_BASEADDRESS 0x16000000 #define VIRT_PLATFORM_BUS_SIZE 0x2000000 #define VIRT_PLATFORM_BUS_NUM_IRQS 2 -#define VIRT_PLATFORM_BUS_IRQ (VIRT_GSI_BASE + 5) +#define VIRT_PLATFORM_BUS_IRQ (VIRT_GSI_BASE + 8) #endif diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index fa6313aabc43b81594ee95d1d17bd9653aeb7a69..0dfe274c33f29c3f327d1f4a68a5fc77acbd8512 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -3,6 +3,7 @@ #include "exec/memory.h" #include "sysemu/dma.h" +#include "sysemu/host_iommu_device.h" /* PCI includes legacy ISA access. */ #include "hw/isa/isa.h" @@ -15,7 +16,7 @@ extern bool pci_available; #define PCI_BUS_NUM(x) (((x) >> 8) & 0xff) #define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) #define PCI_FUNC(devfn) ((devfn) & 0x07) -#define PCI_BUILD_BDF(bus, devfn) ((bus << 8) | (devfn)) +#define PCI_BUILD_BDF(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BDF_TO_DEVFN(x) ((x) & 0xff) #define PCI_BUS_MAX 256 #define PCI_DEVFN_MAX 256 @@ -384,10 +385,58 @@ typedef struct PCIIOMMUOps { * * @devfn: device and function number */ - AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); + AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn); + /** + * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU + * + * Optional callback, if not implemented in vIOMMU, then vIOMMU can't + * retrieve host information from the associated HostIOMMUDevice. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + * + * @dev: the #HostIOMMUDevice to attach. + * + * @errp: pass an Error out only when return false + * + * Returns: true if HostIOMMUDevice is attached or else false with errp set. + */ + bool (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn, + HostIOMMUDevice *dev, Error **errp); + /** + * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU + * + * Optional callback. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + */ + void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn); + /** + * @get_pasid_cap: get pasid capability from vIOMMU + * + * Optional callback. + * + * @bus: the #PCIBus of the PCI device. + * + * @opaque: the data passed to pci_setup_iommu(). + * + * @devfn: device and function number of the PCI device. + */ + bool (*get_pasid_cap)(PCIBus *bus, void *opaque, int devfn); } PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); +bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, + Error **errp); +void pci_device_unset_iommu_device(PCIDevice *dev); +bool pci_device_get_pasid_cap(PCIDevice *dev); /** * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus @@ -632,16 +681,6 @@ static inline void pci_irq_deassert(PCIDevice *pci_dev) pci_set_irq(pci_dev, 0); } -/* - * FIXME: PCI does not work this way. - * All the callers to this method should be fixed. 
- */ -static inline void pci_irq_pulse(PCIDevice *pci_dev) -{ - pci_irq_assert(pci_dev); - pci_irq_deassert(pci_dev); -} - MSIMessage pci_get_msi_message(PCIDevice *dev, int vector); void pci_set_power(PCIDevice *pci_dev, bool state); diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index d3dd0f64b273ad1d192e43afd27c020a414f0a6b..253b48a688b06a356375e4c5c2d12b054d371f87 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -160,6 +160,9 @@ struct PCIDevice { /* ID of standby device in net_failover pair */ char *failover_pair_id; uint32_t acpi_index; + + /* Maximum DMA bounce buffer size used for indirect memory map requests */ + uint32_t max_bounce_buffer_size; }; static inline int pci_intx(PCIDevice *pci_dev) diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index 11f5a91bbb7ec318912e2440daec4cf7d3e5d669..41ee27f0234aa5b19bb158c0c49504af2650e729 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -79,6 +79,9 @@ struct PCIExpressDevice { uint16_t sriov_cap; PCIESriovPF sriov_pf; PCIESriovVF sriov_vf; + + /* Offset of PASID capability in config space */ + uint16_t pasid_cap; }; #define COMPAT_PROP_PCP "power_controller_present" @@ -147,4 +150,5 @@ void pcie_cap_slot_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); void pcie_cap_slot_unplug_request_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); +void pcie_pasid_init(PCIDevice *dev, uint16_t offset, uint16_t caps); #endif /* QEMU_PCIE_H */ diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h index 90e6cf45b873721bf07a61af7b55af8ecf1f0387..7148a0959bff3a1d0babdf9872f98b1895829f0a 100644 --- a/include/hw/pci/pcie_port.h +++ b/include/hw/pci/pcie_port.h @@ -56,6 +56,9 @@ struct PCIESlot { uint8_t chassis; uint16_t slot; + uint8_t fast_plug; + uint8_t fast_unplug; + PCIExpLinkSpeed speed; PCIExpLinkWidth width; diff --git a/include/hw/ppc/mac_dbdma.h b/include/hw/ppc/mac_dbdma.h index 4a3f644516b3bbbb12d5b8b42d9b1b078d68e051..c774f6bf84f1a4f6b40306ad1f76da8e3bd42041 100644 --- a/include/hw/ppc/mac_dbdma.h +++ b/include/hw/ppc/mac_dbdma.h @@ -44,10 +44,6 @@ struct DBDMA_io { DBDMA_end dma_end; /* DMA is in progress, don't start another one */ bool processing; - /* DMA request */ - void *dma_mem; - dma_addr_t dma_len; - DMADirection dir; }; /* diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h index f120874e0ff192a4676b275364146c267fadd3f7..00023c02338cd51239d7a584ceacfbd3e8377641 100644 --- a/include/hw/ppc/xive.h +++ b/include/hw/ppc/xive.h @@ -218,7 +218,7 @@ static inline bool xive_source_esb_has_2page(XiveSource *xsrc) xsrc->esb_shift == XIVE_ESB_4K_2PAGE; } -static inline size_t xive_source_esb_len(XiveSource *xsrc) +static inline uint64_t xive_source_esb_len(XiveSource *xsrc) { return (1ull << xsrc->esb_shift) * xsrc->nr_irqs; } diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 151d9682380d292357a5b28c77824e15050fcb55..2d3661d6cdde5f88539b568742ea0a2b2d2b05be 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -739,6 +739,8 @@ qemu_irq qdev_get_gpio_out_connector(DeviceState *dev, const char *name, int n); */ qemu_irq qdev_intercept_gpio_out(DeviceState *dev, qemu_irq icpt, const char *name, int n); +qemu_irq qdev_disconnect_gpio_out_named(DeviceState *dev, + const char *name, int n); BusState *qdev_get_child_bus(DeviceState *dev, const char *name); diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 
91f7a2452d94b777293b5da92fd096b84ff113b4..63dcf69978fd136f4da43cd5a394b3eb73d6de51 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -8,8 +8,11 @@ extern const PropertyInfo qdev_prop_macaddr; extern const PropertyInfo qdev_prop_reserved_region; extern const PropertyInfo qdev_prop_multifd_compression; extern const PropertyInfo qdev_prop_mig_mode; +extern const PropertyInfo qdev_prop_zero_page_detection; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; +extern const PropertyInfo qdev_prop_blockdev_retry_interval; +extern const PropertyInfo qdev_prop_blockdev_retry_timeout; extern const PropertyInfo qdev_prop_bios_chs_trans; extern const PropertyInfo qdev_prop_fdc_drive_type; extern const PropertyInfo qdev_prop_drive; @@ -46,12 +49,21 @@ extern const PropertyInfo qdev_prop_cpus390entitlement; #define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \ MigMode) +#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \ + ZeroPageDetection) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) #define DEFINE_PROP_BLOCKDEV_ON_ERROR(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_on_error, \ BlockdevOnError) +#define DEFINE_PROP_BLOCKDEV_RETRY_INTERVAL(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_retry_interval, \ + int64_t) +#define DEFINE_PROP_BLOCKDEV_RETRY_TIMEOUT(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_blockdev_retry_timeout, \ + int64_t) #define DEFINE_PROP_BIOS_CHS_TRANS(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_bios_chs_trans, int) #define DEFINE_PROP_BLOCKSIZE(_n, _s, _f) \ diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h index 25743a29a000b15d7a15b15e6d83fa3a279d1edd..63602c2c74bac745ad7c124359604b2ea46052e3 100644 --- a/include/hw/qdev-properties.h +++ b/include/hw/qdev-properties.h @@ -60,6 +60,7 @@ extern const PropertyInfo qdev_prop_int64; extern const PropertyInfo qdev_prop_size; extern const PropertyInfo qdev_prop_string; extern const PropertyInfo qdev_prop_on_off_auto; +extern const PropertyInfo qdev_prop_compress_method; extern const PropertyInfo qdev_prop_size32; extern const PropertyInfo qdev_prop_array; extern const PropertyInfo qdev_prop_link; @@ -168,6 +169,9 @@ extern const PropertyInfo qdev_prop_link; DEFINE_PROP(_n, _s, _f, qdev_prop_string, char*) #define DEFINE_PROP_ON_OFF_AUTO(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_on_off_auto, OnOffAuto) +#define DEFINE_PROP_COMPRESS_METHOD(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_compress_method, \ + CompressMethod) #define DEFINE_PROP_SIZE32(_n, _s, _f, _d) \ DEFINE_PROP_UNSIGNED(_n, _s, _f, _d, qdev_prop_size32, uint32_t) diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h index 3692ca82f3154064e48f811b448bf7eb80303de3..6ec18bf12b1218810abd86c431d6b040f481c804 100644 --- a/include/hw/scsi/scsi.h +++ b/include/hw/scsi/scsi.h @@ -226,6 +226,7 @@ void scsi_req_cancel_complete(SCSIRequest *req); void scsi_req_cancel(SCSIRequest *req); void scsi_req_cancel_async(SCSIRequest *req, Notifier *notifier); void scsi_req_retry(SCSIRequest *req); +void scsi_retry_requests(SCSIDevice *s); void scsi_device_drained_begin(SCSIDevice *sdev); void scsi_device_drained_end(SCSIDevice 
*sdev); void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense); diff --git a/include/hw/usb.h b/include/hw/usb.h index 32c23a5ca2a0559328800bc117137d25b3ed8e8e..911179158d496c4e94ff04e9650aec28859e2108 100644 --- a/include/hw/usb.h +++ b/include/hw/usb.h @@ -142,6 +142,7 @@ #define USB_DEVICE_SELF_POWERED 0 #define USB_DEVICE_REMOTE_WAKEUP 1 +#define USB_DEVICE_REMOTE_WAKEUP_IS_SUPPORTED 2 #define USB_DT_DEVICE 0x01 #define USB_DT_CONFIG 0x02 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index a4a22accb9434c5d04bef73172a0ccbe182405cb..abae8655c4c9df3f3c948da83f9852d339f87528 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -30,6 +30,9 @@ #include #endif #include "sysemu/sysemu.h" +#include "hw/vfio/vfio-container-base.h" +#include "sysemu/host_iommu_device.h" +#include "sysemu/iommufd.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -72,54 +75,29 @@ typedef struct VFIOMigration { bool initial_data_sent; } VFIOMigration; -typedef struct VFIOAddressSpace { - AddressSpace *as; - QLIST_HEAD(, VFIOContainer) containers; - QLIST_ENTRY(VFIOAddressSpace) list; -} VFIOAddressSpace; - struct VFIOGroup; +#define TYPE_HOST_IOMMU_DEVICE_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio" +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO \ + TYPE_HOST_IOMMU_DEVICE_IOMMUFD "-vfio" + +typedef struct VFIODMARange { + QLIST_ENTRY(VFIODMARange) next; + hwaddr iova; + size_t size; + void *vaddr; /* unused */ + unsigned long *bitmap; /* dirty bitmap cache for this range */ +} VFIODMARange; + typedef struct VFIOContainer { - VFIOAddressSpace *space; + VFIOContainerBase bcontainer; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ - MemoryListener listener; - MemoryListener prereg_listener; unsigned iommu_type; - Error *error; - bool initialized; - bool dirty_pages_supported; - uint64_t dirty_pgsizes; - uint64_t max_dirty_bitmap_size; - unsigned long pgsizes; - unsigned int dma_max_mappings; - QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; - QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; + bool dirty_log_manual_clear; QLIST_HEAD(, VFIOGroup) group_list; - QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; - QLIST_ENTRY(VFIOContainer) next; - QLIST_HEAD(, VFIODevice) device_list; - GList *iova_ranges; + QLIST_HEAD(, VFIODMARange) dma_list; } VFIOContainer; -typedef struct VFIOGuestIOMMU { - VFIOContainer *container; - IOMMUMemoryRegion *iommu_mr; - hwaddr iommu_offset; - IOMMUNotifier n; - QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; -} VFIOGuestIOMMU; - -typedef struct VFIORamDiscardListener { - VFIOContainer *container; - MemoryRegion *mr; - hwaddr offset_within_address_space; - hwaddr size; - uint64_t granularity; - RamDiscardListener listener; - QLIST_ENTRY(VFIORamDiscardListener) next; -} VFIORamDiscardListener; - typedef struct VFIOHostDMAWindow { hwaddr min_iova; hwaddr max_iova; @@ -127,6 +105,22 @@ typedef struct VFIOHostDMAWindow { QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; } VFIOHostDMAWindow; +typedef struct IOMMUFDBackend IOMMUFDBackend; + +typedef struct VFIOIOASHwpt { + uint32_t hwpt_id; + uint32_t hwpt_flags; + QLIST_HEAD(, VFIODevice) device_list; + QLIST_ENTRY(VFIOIOASHwpt) next; +} VFIOIOASHwpt; + +typedef struct VFIOIOMMUFDContainer { + VFIOContainerBase bcontainer; + IOMMUFDBackend *be; + uint32_t ioas_id; + QLIST_HEAD(, VFIOIOASHwpt) hwpt_list; +} VFIOIOMMUFDContainer; + typedef struct VFIODeviceOps VFIODeviceOps; typedef struct VFIODevice { @@ -134,12 +128,13 @@ typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) container_next; 
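The vfio-common.h hunk above replaces the open-coded VFIOContainer bookkeeping with a VFIOContainerBase plus a per-container dma_list of VFIODMARange entries, each caching a dirty bitmap for one IOVA range. A minimal sketch, assuming host-page granularity, of how such a range's bitmap cache could be sized; the helper name below is hypothetical (the real initializer is the vfio_dma_range_init_dirty_bitmap() declared further down in this header):

```c
#include "qemu/osdep.h"
#include "qemu/bitmap.h"
#include "hw/vfio/vfio-common.h"

/* Hypothetical sketch: size and allocate the dirty bitmap cache for one
 * VFIODMARange, one bit per host page in [iova, iova + size). */
static void dma_range_alloc_bitmap(VFIODMARange *qrange)
{
    uint64_t pages = QEMU_ALIGN_UP(qrange->size, qemu_real_host_page_size()) /
                     qemu_real_host_page_size();

    qrange->bitmap = bitmap_new(pages); /* bitmap_new() zero-fills */
}
```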
QLIST_ENTRY(VFIODevice) global_next; struct VFIOGroup *group; - VFIOContainer *container; + VFIOContainerBase *bcontainer; char *sysfsdev; char *name; DeviceState *dev; int fd; int type; + bool mdev; bool reset_works; bool needs_reset; bool no_mmap; @@ -152,8 +147,15 @@ typedef struct VFIODevice { VFIOMigration *migration; Error *migration_blocker; OnOffAuto pre_copy_dirty_page_tracking; + OnOffAuto device_dirty_page_tracking; bool dirty_pages_supported; bool dirty_tracking; + bool iommu_dirty_tracking; + HostIOMMUDevice *hiod; + int devid; + IOMMUFDBackend *iommufd; + VFIOIOASHwpt *hwpt; + QLIST_ENTRY(VFIODevice) hwpt_next; } VFIODevice; struct VFIODeviceOps { @@ -201,31 +203,14 @@ typedef struct VFIODisplay { } dmabuf; } VFIODisplay; -typedef struct { - unsigned long *bitmap; - hwaddr size; - hwaddr pages; -} VFIOBitmap; - VFIOAddressSpace *vfio_get_address_space(AddressSpace *as); void vfio_put_address_space(VFIOAddressSpace *space); -bool vfio_devices_all_running_and_saving(VFIOContainer *container); -/* container->fd */ -int vfio_dma_unmap(VFIOContainer *container, hwaddr iova, - ram_addr_t size, IOMMUTLBEntry *iotlb); -int vfio_dma_map(VFIOContainer *container, hwaddr iova, - ram_addr_t size, void *vaddr, bool readonly); -int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start); -int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap, - hwaddr iova, hwaddr size); +VFIODMARange *vfio_lookup_match_range(const VFIOContainer *container, + hwaddr start_addr, hwaddr size); +void vfio_dma_range_init_dirty_bitmap(VFIODMARange *qrange); /* SPAPR specific */ -int vfio_container_add_section_window(VFIOContainer *container, - MemoryRegionSection *section, - Error **errp); -void vfio_container_del_section_window(VFIOContainer *container, - MemoryRegionSection *section); int vfio_spapr_container_init(VFIOContainer *container, Error **errp); void vfio_spapr_container_deinit(VFIOContainer *container); @@ -247,6 +232,8 @@ void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); struct vfio_device_info *vfio_get_device_info(int fd); +bool vfio_device_is_mdev(VFIODevice *vbasedev); +bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp); int vfio_attach_device(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void vfio_detach_device(VFIODevice *vbasedev); @@ -259,7 +246,6 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIOGroupList vfio_group_list; extern VFIODeviceList vfio_device_list; - extern const MemoryListener vfio_memory_listener; extern int vfio_kvm_device_fd; @@ -292,11 +278,20 @@ bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); void vfio_migration_exit(VFIODevice *vbasedev); int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size); -bool vfio_devices_all_running_and_mig_active(VFIOContainer *container); -bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container); -int vfio_devices_query_dirty_bitmap(VFIOContainer *container, +bool +vfio_devices_all_running_and_mig_active(const VFIOContainerBase *bcontainer); +bool +vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer); +int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer, VFIOBitmap *vbmap, hwaddr iova, hwaddr size); -int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, - uint64_t size, ram_addr_t ram_addr); +int vfio_get_dirty_bitmap(const 
VFIOContainerBase *bcontainer, uint64_t iova, + uint64_t size, ram_addr_t ram_addr); + +/* Returns 0 on success, or a negative errno. */ +int vfio_device_get_name(VFIODevice *vbasedev, Error **errp); +void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); +void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, + DeviceState *dev, bool ram_discard); +int vfio_device_get_aw_bits(VFIODevice *vdev); #endif /* HW_VFIO_VFIO_COMMON_H */ diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h new file mode 100644 index 0000000000000000000000000000000000000000..7a4c575115a74112bff1eb21890b2e7f7b1014e6 --- /dev/null +++ b/include/hw/vfio/vfio-container-base.h @@ -0,0 +1,143 @@ +/* + * VFIO BASE CONTAINER + * + * Copyright (C) 2023 Intel Corporation. + * Copyright Red Hat, Inc. 2023 + * + * Authors: Yi Liu + * Eric Auger + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_VFIO_CONTAINER_BASE_H +#define HW_VFIO_VFIO_CONTAINER_BASE_H + +#include "exec/memory.h" + +typedef struct VFIODevice VFIODevice; +typedef struct VFIOIOMMUClass VFIOIOMMUClass; + +typedef struct { + unsigned long *bitmap; + hwaddr size; + hwaddr pages; +} VFIOBitmap; + +typedef struct VFIOAddressSpace { + AddressSpace *as; + QLIST_HEAD(, VFIOContainerBase) containers; + QLIST_ENTRY(VFIOAddressSpace) list; +} VFIOAddressSpace; + +/* + * This is the base object for vfio container backends + */ +typedef struct VFIOContainerBase { + const VFIOIOMMUClass *ops; + VFIOAddressSpace *space; + MemoryListener listener; + Error *error; + bool initialized; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; + unsigned long pgsizes; + unsigned int dma_max_mappings; + bool dirty_pages_supported; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; + QLIST_HEAD(, VFIORamDiscardListener) vrdl_list; + QLIST_ENTRY(VFIOContainerBase) next; + QLIST_HEAD(, VFIODevice) device_list; + GList *iova_ranges; +} VFIOContainerBase; + +typedef struct VFIOGuestIOMMU { + VFIOContainerBase *bcontainer; + IOMMUMemoryRegion *iommu_mr; + hwaddr iommu_offset; + IOMMUNotifier n; + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; +} VFIOGuestIOMMU; + +typedef struct VFIORamDiscardListener { + VFIOContainerBase *bcontainer; + MemoryRegion *mr; + hwaddr offset_within_address_space; + hwaddr size; + uint64_t granularity; + RamDiscardListener listener; + QLIST_ENTRY(VFIORamDiscardListener) next; +} VFIORamDiscardListener; + +int vfio_container_dma_map(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); +int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); +int vfio_container_add_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp); +void vfio_container_del_section_window(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); +int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer, + bool start); +int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); + +void vfio_container_init(VFIOContainerBase *bcontainer, + VFIOAddressSpace *space, + const VFIOIOMMUClass *ops); +void vfio_container_destroy(VFIOContainerBase *bcontainer); + + +#define TYPE_VFIO_IOMMU "vfio-iommu" +#define TYPE_VFIO_IOMMU_LEGACY TYPE_VFIO_IOMMU "-legacy" +#define TYPE_VFIO_IOMMU_SPAPR TYPE_VFIO_IOMMU "-spapr" +#define TYPE_VFIO_IOMMU_IOMMUFD TYPE_VFIO_IOMMU "-iommufd" + +/* + * 
VFIOContainerBase is not an abstract QOM object because it felt + * unnecessary to expose all the IOMMU backends to the QEMU machine + * and human interface. However, we can still abstract the IOMMU + * backend handlers using a QOM interface class. This provides more + * flexibility when referencing the various implementations. + */ +DECLARE_CLASS_CHECKERS(VFIOIOMMUClass, VFIO_IOMMU, TYPE_VFIO_IOMMU) + +struct VFIOIOMMUClass { + InterfaceClass parent_class; + + /* Properties */ + const char *hiod_typename; + + /* basic feature */ + int (*setup)(VFIOContainerBase *bcontainer, Error **errp); + int (*dma_map)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly); + int (*dma_unmap)(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb); + int (*attach_device)(const char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp); + void (*detach_device)(VFIODevice *vbasedev); + /* migration feature */ + int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer, + bool start); + int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer, + VFIOBitmap *vbmap, + hwaddr iova, hwaddr size); + /* PCI specific */ + int (*pci_hot_reset)(VFIODevice *vbasedev, bool single); + + /* SPAPR specific */ + int (*add_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + Error **errp); + void (*del_window)(VFIOContainerBase *bcontainer, + MemoryRegionSection *section); + void (*release)(VFIOContainerBase *bcontainer); +}; +#endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ diff --git a/include/hw/virtio/vdpa-dev-iommufd.h b/include/hw/virtio/vdpa-dev-iommufd.h new file mode 100644 index 0000000000000000000000000000000000000000..8e56647690a0bf8a92ee82384d6b80a75b530728 --- /dev/null +++ b/include/hw/virtio/vdpa-dev-iommufd.h @@ -0,0 +1,41 @@ +/* + * vhost vDPA device support iommufd header + * + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All Rights Reserved. + */ + +#ifndef _VHOST_VDPA_IOMMUFD_H +#define _VHOST_VDPA_IOMMUFD_H + +#include "hw/virtio/vdpa-dev.h" + +/* + * A HW pagetable is called an iommu_domain inside the kernel. + * This user object allows directly creating and inspecting the + * domains. Domains that have kernel owned page tables will be + * associated with an iommufd_ioas that provides the IOVA to + * PFN map. + */ +typedef struct IOMMUFDHWPT { + uint32_t hwpt_id; + QLIST_HEAD(, VhostVdpaDevice) device_list; + QLIST_ENTRY(IOMMUFDHWPT) next; +} IOMMUFDHWPT; + +typedef struct VDPAIOMMUFDContainer { + MemoryListener listener; + struct IOMMUFDBackend *iommufd; + uint32_t ioas_id; + QLIST_HEAD(, IOMMUFDHWPT) hwpt_list; + QLIST_ENTRY(VDPAIOMMUFDContainer) next; +} VDPAIOMMUFDContainer; + +struct vdpa_dev_bind_iommufd { + __s32 iommufd; + __u32 out_devid; +}; + +int vhost_vdpa_attach_container(VhostVdpaDevice *vdev); +void vhost_vdpa_detach_container(VhostVdpaDevice *vdev); + +#endif /* _VHOST_VDPA_IOMMUFD_H */ diff --git a/include/hw/virtio/vdpa-dev-mig.h b/include/hw/virtio/vdpa-dev-mig.h new file mode 100644 index 0000000000000000000000000000000000000000..adc1d657f70aa500cb9a6578c6a9444c7c0677b9 --- /dev/null +++ b/include/hw/virtio/vdpa-dev-mig.h @@ -0,0 +1,29 @@ +/* + * Vhost Vdpa Device Migration Header + * + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All Rights Reserved.
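Since VFIOIOMMUClass is a QOM interface rather than an object hierarchy, a backend plugs in by registering a type whose parent is TYPE_VFIO_IOMMU and filling the callbacks in class_init. A hedged sketch of that registration; the type name and stub below are illustrative, not the actual hw/vfio backends:

```c
#include "qemu/osdep.h"
#include "hw/vfio/vfio-container-base.h"

/* Illustrative stub; a real backend would issue its map ioctls here. */
static int demo_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    return -ENOTSUP;
}

static void demo_iommu_class_init(ObjectClass *klass, void *data)
{
    VFIOIOMMUClass *ops = VFIO_IOMMU_CLASS(klass);

    ops->dma_map = demo_dma_map;
    /* ...dma_unmap, attach_device, detach_device, and so on */
}

static const TypeInfo demo_iommu_info = {
    .name = TYPE_VFIO_IOMMU "-demo",        /* hypothetical backend name */
    .parent = TYPE_VFIO_IOMMU,
    .class_init = demo_iommu_class_init,
};

static void demo_register_types(void)
{
    type_register_static(&demo_iommu_info);
}

type_init(demo_register_types);
```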
+ */ + +#ifndef _VHOST_VDPA_MIGRATION_H +#define _VHOST_VDPA_MIGRATION_H + +#include "hw/virtio/vdpa-dev.h" + +enum { + VDPA_DEVICE_START, + VDPA_DEVICE_STOP, + VDPA_DEVICE_PRE_START, + VDPA_DEVICE_PRE_STOP, + VDPA_DEVICE_CANCEL, + VDPA_DEVICE_POST_START, + VDPA_DEVICE_START_ASYNC, + VDPA_DEVICE_STOP_ASYNC, + VDPA_DEVICE_PRE_START_ASYNC, + VDPA_DEVICE_QUERY_OP_STATE, +}; + +void vdpa_migration_register(VhostVdpaDevice *vdev); + +void vdpa_migration_unregister(VhostVdpaDevice *vdev); + +#endif /* _VHOST_VDPA_MIGRATION_H */ diff --git a/include/hw/virtio/vdpa-dev.h b/include/hw/virtio/vdpa-dev.h index 4dbf98195c4cc1348494c16275cb71717b9482ed..872e630546c251fe18a9345ffc302cc2930d17ef 100644 --- a/include/hw/virtio/vdpa-dev.h +++ b/include/hw/virtio/vdpa-dev.h @@ -18,6 +18,7 @@ #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-vdpa.h" #include "qom/object.h" +#include "sysemu/iommufd.h" #define TYPE_VHOST_VDPA_DEVICE "vhost-vdpa-device" @@ -37,7 +38,13 @@ struct VhostVdpaDevice { int config_size; uint16_t queue_size; bool started; + bool suspended; int (*post_init)(VhostVdpaDevice *v, Error **errp); + VMChangeStateEntry *vmstate; + Notifier migration_state; + IOMMUFDBackend *iommufd; + uint32_t iommufd_devid; + QLIST_ENTRY(VhostVdpaDevice) next; }; #endif diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index a86d103f8245b2b8602bd534960bf012e52cceb5..84b8fa1075763ce5961517034094b80e9e683621 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -65,6 +65,11 @@ typedef int (*vhost_scsi_get_abi_version_op)(struct vhost_dev *dev, int *version); typedef int (*vhost_set_log_base_op)(struct vhost_dev *dev, uint64_t base, struct vhost_log *log); +typedef int (*vhost_set_log_size_op)(struct vhost_dev *dev, uint64_t size, + struct vhost_log *log); +typedef int (*vhost_set_log_fd_op)(struct vhost_dev *dev, int fd, + struct vhost_log *log); +typedef int (*vhost_log_sync_op)(struct vhost_dev *dev); typedef int (*vhost_set_mem_table_op)(struct vhost_dev *dev, struct vhost_memory *mem); typedef int (*vhost_set_vring_addr_op)(struct vhost_dev *dev, @@ -150,6 +155,9 @@ typedef int (*vhost_set_device_state_fd_op)(struct vhost_dev *dev, Error **errp); typedef int (*vhost_check_device_state_op)(struct vhost_dev *dev, Error **errp); +typedef int (*vhost_dev_suspend_op)(struct vhost_dev *dev); +typedef int (*vhost_dev_resume_op)(struct vhost_dev *dev); + typedef struct VhostOps { VhostBackendType backend_type; vhost_backend_init vhost_backend_init; @@ -162,6 +170,9 @@ typedef struct VhostOps { vhost_scsi_clear_endpoint_op vhost_scsi_clear_endpoint; vhost_scsi_get_abi_version_op vhost_scsi_get_abi_version; vhost_set_log_base_op vhost_set_log_base; + vhost_set_log_size_op vhost_set_log_size; + vhost_set_log_fd_op vhost_set_log_fd; + vhost_log_sync_op vhost_log_sync; vhost_set_mem_table_op vhost_set_mem_table; vhost_set_vring_addr_op vhost_set_vring_addr; vhost_set_vring_endian_op vhost_set_vring_endian; @@ -200,6 +211,8 @@ typedef struct VhostOps { vhost_supports_device_state_op vhost_supports_device_state; vhost_set_device_state_fd_op vhost_set_device_state_fd; vhost_check_device_state_op vhost_check_device_state; + vhost_dev_suspend_op vhost_dev_suspend; + vhost_dev_resume_op vhost_dev_resume; } VhostOps; int vhost_backend_update_device_iotlb(struct vhost_dev *dev, diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h index 5407d54fd79bb6904729055fb05a8927d19ab17d..e32effc6e11feafa27420ac7dc631c6303e2ca21 100644 --- 
a/include/hw/virtio/vhost-vdpa.h +++ b/include/hw/virtio/vhost-vdpa.h @@ -42,8 +42,6 @@ typedef struct vhost_vdpa { bool shadow_vqs_enabled; /* Vdpa must send shadow addresses as IOTLB key for data queues, not GPA */ bool shadow_data; - /* Device suspended successfully */ - bool suspended; /* IOVA mapping used by the Shadow Virtqueue */ VhostIOVATree *iova_tree; GPtrArray *shadow_vqs; @@ -59,6 +57,13 @@ typedef struct vhost_vdpa { int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range); int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx); +Int128 vhost_vdpa_section_end(const MemoryRegionSection *section, + int page_mask); +bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, + uint64_t iova_min, + uint64_t iova_max, + int page_mask); + int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova, hwaddr size, void *vaddr, bool readonly); int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova, diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 02477788df5eb370ac1d7ddb1cf02859a07fa150..598ae137575304dac90c6bb5889901656d015b70 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -43,6 +43,7 @@ typedef unsigned long vhost_log_chunk_t; #define VHOST_LOG_PAGE 0x1000 #define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t)) #define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS) +#define VHOST_LOG_CHUNK_BYTES (VHOST_LOG_PAGE * sizeof(vhost_log_chunk_t)) #define VHOST_INVALID_FEATURE_BIT (0xff) #define VHOST_QUEUE_NUM_CONFIG_INR 0 @@ -132,6 +133,7 @@ struct vhost_dev { QLIST_HEAD(, vhost_iommu) iommu_list; IOMMUNotifier n; const VhostDevConfigOps *config_ops; + bool has_container; }; extern const VhostOps kernel_ops; @@ -205,6 +207,14 @@ static inline bool vhost_dev_is_started(struct vhost_dev *hdev) return hdev->started; } +/** + * vhost_bytemap_log_support() - check if the vhost device supports dirty bytemap + * @dev: common vhost_dev structure + * + * Return: true if the vhost device supports dirty bytemap, false otherwise. 
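The vhost.h hunk above adds VHOST_LOG_CHUNK_BYTES alongside the existing VHOST_LOG_CHUNK, which makes the bitmap versus bytemap trade-off easy to quantify. A standalone arithmetic sketch, assuming an LP64 host where vhost_log_chunk_t is an 8-byte unsigned long:

```c
#include <stdio.h>

#define VHOST_LOG_PAGE        0x1000ULL                 /* 4 KiB */
#define VHOST_LOG_CHUNK_T     8ULL                      /* sizeof(vhost_log_chunk_t) on LP64 */
#define VHOST_LOG_BITS        (8 * VHOST_LOG_CHUNK_T)   /* 64 */
#define VHOST_LOG_CHUNK       (VHOST_LOG_PAGE * VHOST_LOG_BITS)     /* 256 KiB per chunk */
#define VHOST_LOG_CHUNK_BYTES (VHOST_LOG_PAGE * VHOST_LOG_CHUNK_T)  /* 32 KiB per chunk */

int main(void)
{
    unsigned long long mem = 4ULL << 30;   /* assume 4 GiB of guest memory */

    /* bitmap log: one bit per page, so a chunk covers VHOST_LOG_CHUNK bytes */
    printf("bitmap chunks:  %llu\n", mem / VHOST_LOG_CHUNK);        /* 16384 */
    /* bytemap log: one byte per page, so a chunk covers only 8 pages */
    printf("bytemap chunks: %llu\n", mem / VHOST_LOG_CHUNK_BYTES);  /* 131072 */
    return 0;
}
```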
+ */ +bool vhost_bytemap_log_support(struct vhost_dev *dev); + /** * vhost_dev_start() - start the vhost device * @hdev: common vhost_dev structure @@ -340,7 +350,14 @@ int vhost_dev_set_inflight(struct vhost_dev *dev, struct vhost_inflight *inflight); int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, struct vhost_inflight *inflight); +bool used_memslots_is_exceeded(void); bool vhost_dev_has_iommu(struct vhost_dev *dev); +int vhost_sync_dirty_bitmap(struct vhost_dev *dev, + MemoryRegionSection *section, + hwaddr first, + hwaddr last); +int vhost_sync_dirty_bytemap(struct vhost_dev *dev, + MemoryRegionSection *section); #ifdef CONFIG_VHOST int vhost_reset_device(struct vhost_dev *hdev); @@ -464,4 +481,7 @@ int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp); */ int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp); +int vhost_dev_resume(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); +int vhost_dev_suspend(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings); + #endif diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index c8f72850bc05f4c6ea7b51aae4b572b04882725f..672f7445dd0e489d805c49d0b16ef94cca83519c 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -22,6 +22,7 @@ #include "standard-headers/linux/virtio_config.h" #include "standard-headers/linux/virtio_ring.h" #include "qom/object.h" +#include "block/aio.h" /* * A guest should never accept this. It implies negotiation is broken @@ -60,6 +61,7 @@ size_t virtio_get_config_size(const VirtIOConfigSizeParams *params, typedef struct VirtQueue VirtQueue; #define VIRTQUEUE_MAX_SIZE 1024 +#define VIRTIO_NET_VQ_MAX_SIZE (4096) typedef struct VirtQueueElement { @@ -145,6 +147,7 @@ struct VirtIODevice bool use_started; bool started; bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */ + bool defer_kvm_irq_routing; bool disable_legacy_check; bool vhost_started; VMChangeStateEntry *vmstate; @@ -182,6 +185,7 @@ struct VirtioDeviceClass { int (*validate_features)(VirtIODevice *vdev); void (*get_config)(VirtIODevice *vdev, uint8_t *config); void (*set_config)(VirtIODevice *vdev, const uint8_t *config); + void (*print_features)(uint64_t features); void (*reset)(VirtIODevice *vdev); void (*set_status)(VirtIODevice *vdev, uint8_t val); /* Device must validate queue_index. */ @@ -270,9 +274,13 @@ void qemu_put_virtqueue_element(VirtIODevice *vdev, QEMUFile *f, VirtQueueElement *elem); int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, unsigned int out_bytes); -void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, - unsigned int *out_bytes, - unsigned max_in_bytes, unsigned max_out_bytes); +/** + * Return <0 on error or an opaque >=0 to pass to + * virtio_queue_enable_notification_and_check on success. + */ +int virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes, unsigned max_in_bytes, + unsigned max_out_bytes); void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq); void virtio_notify(VirtIODevice *vdev, VirtQueue *vq); @@ -306,6 +314,15 @@ int virtio_queue_ready(VirtQueue *vq); int virtio_queue_empty(VirtQueue *vq); +/** + * Enable notification and check whether guest has added some + * buffers since last call to virtqueue_get_avail_bytes. + * + * @opaque: value returned from virtqueue_get_avail_bytes + */ +bool virtio_queue_enable_notification_and_check(VirtQueue *vq, + int opaque); + /* Host binding interface. 
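The virtio.h changes above pair the new int-returning virtqueue_get_avail_bytes() with virtio_queue_enable_notification_and_check(): the opaque token snapshots the avail ring, so the check can detect buffers the guest queued while notifications were disabled. A hedged sketch of the calling pattern in a device's queue handler; process_buffers() is a hypothetical device-specific helper:

```c
/* Sketch of the re-check pattern the comments above describe. */
static void demo_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    unsigned int in_bytes, out_bytes;
    int opaque;

    do {
        virtio_queue_set_notification(vq, 0);   /* suppress while we work */
        process_buffers(vdev, vq);              /* drain what is visible now */

        /* Snapshot the avail ring; the return value is the opaque token. */
        opaque = virtqueue_get_avail_bytes(vq, &in_bytes, &out_bytes, 0, 0);
        if (opaque < 0) {
            virtio_queue_set_notification(vq, 1);
            break;
        }
        /* Re-arm notifications and ask whether the guest added buffers
         * after the snapshot; if so, loop instead of going idle. */
    } while (virtio_queue_enable_notification_and_check(vq, opaque));
}
```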
*/ uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr); @@ -508,4 +525,10 @@ static inline bool virtio_device_disabled(VirtIODevice *vdev) bool virtio_legacy_allowed(VirtIODevice *vdev); bool virtio_legacy_check_disabled(VirtIODevice *vdev); +QEMUBH *virtio_bh_new_guarded_full(DeviceState *dev, + QEMUBHFunc *cb, void *opaque, + const char *name); +#define virtio_bh_new_guarded(dev, cb, opaque) \ + virtio_bh_new_guarded_full((dev), (cb), (opaque), (stringify(cb))) + #endif diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index 965f5d5450034541ca548b3f11e1246a3695e0aa..60079086a800b2d49ba3a95f040866b00e83a7db 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -63,4 +63,6 @@ void monitor_register_hmp_info_hrt(const char *name, int error_vprintf_unless_qmp(const char *fmt, va_list ap) G_GNUC_PRINTF(1, 0); int error_printf_unless_qmp(const char *fmt, ...) G_GNUC_PRINTF(1, 2); +void monitor_qapi_event_discard_io_error(void); + #endif /* MONITOR_H */ diff --git a/include/qemu/bswap.h b/include/qemu/bswap.h index 933a66ee87e258db1245bf9d0b56b74f47664fe0..49e4944457f5b024dc7811833790d408bb0ca058 100644 --- a/include/qemu/bswap.h +++ b/include/qemu/bswap.h @@ -138,6 +138,8 @@ CPU_CONVERT(le, 16, uint16_t) CPU_CONVERT(le, 32, uint32_t) CPU_CONVERT(le, 64, uint64_t) +#undef CPU_CONVERT + /* * Same as cpu_to_le{16,32,64}, except that gcc will figure the result is * a compile-time constant if you pass in a constant. So this can be diff --git a/include/qemu/chardev_open.h b/include/qemu/chardev_open.h new file mode 100644 index 0000000000000000000000000000000000000000..64e8fcfdcb239e47774d55dfc57efff9101ce0a8 --- /dev/null +++ b/include/qemu/chardev_open.h @@ -0,0 +1,16 @@ +/* + * QEMU Chardev Helper + * + * Copyright (C) 2023 Intel Corporation. + * + * Authors: Yi Liu + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef QEMU_CHARDEV_OPEN_H +#define QEMU_CHARDEV_OPEN_H + +int open_cdev(const char *devpath, dev_t cdev); +#endif diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h index 1da148552f74b542959dd48448eb6bbc3aead935..11b550a0fcbf55cf67c2ed0eaf7317e1a6128475 100644 --- a/include/qemu/coroutine_int.h +++ b/include/qemu/coroutine_int.h @@ -73,5 +73,6 @@ Coroutine *qemu_coroutine_new(void); void qemu_coroutine_delete(Coroutine *co); CoroutineAction qemu_coroutine_switch(Coroutine *from, Coroutine *to, CoroutineAction action); - +void qemu_coroutine_info_add(const Coroutine *co_); +void qemu_coroutine_info_delete(const Coroutine *co_); #endif diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h index 8344daaa03f2d48819fdd3d6f1fef862b8eacd3a..63e4edfd2f9c041538274c51cf721332950e1426 100644 --- a/include/qemu/mmap-alloc.h +++ b/include/qemu/mmap-alloc.h @@ -1,6 +1,10 @@ #ifndef QEMU_MMAP_ALLOC_H #define QEMU_MMAP_ALLOC_H +#define HUGETLBFS_MAGIC 0x958458f6 + +size_t qemu_fd_getfiletype(int fd); + typedef enum { QEMU_FS_TYPE_UNKNOWN = 0, QEMU_FS_TYPE_TMPFS, diff --git a/include/qemu/range.h b/include/qemu/range.h index 205e1da76dc5b29327f590b8293a826ced63c25d..4ce694a398311a32e9b5e3ca97aed97c90c0ae09 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -20,6 +20,8 @@ #ifndef QEMU_RANGE_H #define QEMU_RANGE_H +#include "qemu/bitops.h" + /* * Operations on 64 bit address ranges. 
* Notes: @@ -217,6 +219,15 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, return !(last2 < first1 || last1 < first2); } +/* Get highest non-zero bit position of a range */ +static inline int range_get_last_bit(Range *range) +{ + if (range_is_empty(range)) { + return -1; + } + return 63 - clz64(range->upb); +} + /* * Return -1 if @a < @b, 1 @a > @b, and 0 if they touch or overlap. * Both @a and @b must not be empty. diff --git a/include/qemu/timer.h b/include/qemu/timer.h index 9a366e551fb36670112e4c21a62cfe41e9f530da..475c2a3f182220e097f912a39340e739c5f09a6c 100644 --- a/include/qemu/timer.h +++ b/include/qemu/timer.h @@ -91,6 +91,34 @@ struct QEMUTimer { int scale; }; +#define QEMU_USB_NORMAL_FREQ 1000 +#define QEMU_USB_LAZY_FREQ 10 +#define MAX_USB_CONTROLLER_TYPES 4 +#define QEMU_USB_CONTROLLER_OHCI 0 +#define QEMU_USB_CONTROLLER_UHCI 1 +#define QEMU_USB_CONTROLLER_EHCI 2 +#define QEMU_USB_CONTROLLER_XHCI 3 + +typedef void (*QEMUSetFreqHandler) (int freq); + +typedef struct qemu_usb_controller { + const char *name; + QEMUSetFreqHandler qemu_set_freq; +} qemu_usb_controller; + +typedef qemu_usb_controller* qemu_usb_controller_ptr; + +enum qemu_timer_mode { + QEMU_TIMER_USB_NORMAL_MODE = 1 << 0, /* Set when VNC connect or + * with usb dev passthrough + */ + QEMU_TIMER_USB_LAZY_MODE = 1 << 1, /* Set when VNC disconnect */ +}; + +int qemu_register_usb_controller(qemu_usb_controller_ptr controller, + unsigned int type); +int qemu_timer_set_mode(enum qemu_timer_mode mode, unsigned int type); + extern QEMUTimerListGroup main_loop_tlg; /* diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h index 18a4314212d7bf604920f554cafb2ad879ee5db7..a1979308d7e1ce1a6e07284a58af511da534562a 100644 --- a/include/qemu/userfaultfd.h +++ b/include/qemu/userfaultfd.h @@ -39,7 +39,6 @@ int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake); int uffd_wakeup(int uffd_fd, void *addr, uint64_t length); int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count); -bool uffd_poll_events(int uffd_fd, int tmo); #endif /* CONFIG_LINUX */ diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h index 72279f4d25d4b01fb876256511605e4aa78b5720..3afb70160f0bf5feaf7ae284ea19e6e51e822cf5 100644 --- a/include/standard-headers/drm/drm_fourcc.h +++ b/include/standard-headers/drm/drm_fourcc.h @@ -322,6 +322,8 @@ extern "C" { * index 1 = Cr:Cb plane, [39:0] Cr1:Cb1:Cr0:Cb0 little endian */ #define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') /* 2x2 subsampled Cr:Cb plane */ +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') /* 2x1 subsampled Cr:Cb plane */ +#define DRM_FORMAT_NV30 fourcc_code('N', 'V', '3', '0') /* non-subsampled Cr:Cb plane */ /* * 2 plane YCbCr MSB aligned diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h index 6b9793842c98496feff47cba2f0a49eecdfac7d6..fc0dcd10aededdb8ea0f2ddb7144dcf44854727d 100644 --- a/include/standard-headers/linux/fuse.h +++ b/include/standard-headers/linux/fuse.h @@ -209,7 +209,7 @@ * - add FUSE_HAS_EXPIRE_ONLY * * 7.39 - * - add FUSE_DIRECT_IO_RELAX + * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures */ @@ -405,8 +405,7 @@ struct fuse_file_lock { * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation - * 
FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now - * allow shared mmap + * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -445,7 +444,10 @@ struct fuse_file_lock { #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) -#define FUSE_DIRECT_IO_RELAX (1ULL << 36) +#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) + +/* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ +#define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP /** * CUSE INIT request/reply flags diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h index e5f558d9649396faed564c9156ab3d03b9aaa7ab..a39193213ff25ba24233fe28bc4977f38c294702 100644 --- a/include/standard-headers/linux/pci_regs.h +++ b/include/standard-headers/linux/pci_regs.h @@ -80,6 +80,7 @@ #define PCI_HEADER_TYPE_NORMAL 0 #define PCI_HEADER_TYPE_BRIDGE 1 #define PCI_HEADER_TYPE_CARDBUS 2 +#define PCI_HEADER_TYPE_MFD 0x80 /* Multi-Function Device (possible) */ #define PCI_BIST 0x0f /* 8 bits */ #define PCI_BIST_CODE_MASK 0x0f /* Return result */ @@ -637,6 +638,7 @@ #define PCI_EXP_RTCAP 0x1e /* Root Capabilities */ #define PCI_EXP_RTCAP_CRSVIS 0x0001 /* CRS Software Visibility capability */ #define PCI_EXP_RTSTA 0x20 /* Root Status */ +#define PCI_EXP_RTSTA_PME_RQ_ID 0x0000ffff /* PME Requester ID */ #define PCI_EXP_RTSTA_PME 0x00010000 /* PME status */ #define PCI_EXP_RTSTA_PENDING 0x00020000 /* PME pending */ /* @@ -930,12 +932,13 @@ /* Process Address Space ID */ #define PCI_PASID_CAP 0x04 /* PASID feature register */ -#define PCI_PASID_CAP_EXEC 0x02 /* Exec permissions Supported */ -#define PCI_PASID_CAP_PRIV 0x04 /* Privilege Mode Supported */ +#define PCI_PASID_CAP_EXEC 0x0002 /* Exec permissions Supported */ +#define PCI_PASID_CAP_PRIV 0x0004 /* Privilege Mode Supported */ +#define PCI_PASID_CAP_WIDTH 0x1f00 #define PCI_PASID_CTRL 0x06 /* PASID control register */ -#define PCI_PASID_CTRL_ENABLE 0x01 /* Enable bit */ -#define PCI_PASID_CTRL_EXEC 0x02 /* Exec permissions Enable */ -#define PCI_PASID_CTRL_PRIV 0x04 /* Privilege Mode Enable */ +#define PCI_PASID_CTRL_ENABLE 0x0001 /* Enable bit */ +#define PCI_PASID_CTRL_EXEC 0x0002 /* Exec permissions Enable */ +#define PCI_PASID_CTRL_PRIV 0x0004 /* Privilege Mode Enable */ #define PCI_EXT_CAP_PASID_SIZEOF 8 /* Single Root I/O Virtualization */ @@ -975,6 +978,8 @@ #define PCI_LTR_VALUE_MASK 0x000003ff #define PCI_LTR_SCALE_MASK 0x00001c00 #define PCI_LTR_SCALE_SHIFT 10 +#define PCI_LTR_NOSNOOP_VALUE 0x03ff0000 /* Max No-Snoop Latency Value */ +#define PCI_LTR_NOSNOOP_SCALE 0x1c000000 /* Scale for Max Value */ #define PCI_EXT_CAP_LTR_SIZEOF 8 /* Access Control Service */ @@ -1042,9 +1047,16 @@ #define PCI_EXP_DPC_STATUS 0x08 /* DPC Status */ #define PCI_EXP_DPC_STATUS_TRIGGER 0x0001 /* Trigger Status */ #define PCI_EXP_DPC_STATUS_TRIGGER_RSN 0x0006 /* Trigger Reason */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR 0x0000 /* Uncorrectable error */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE 0x0002 /* Rcvd ERR_NONFATAL */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE 0x0004 /* Rcvd ERR_FATAL */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT 0x0006 /* Reason in Trig Reason Extension field */ #define PCI_EXP_DPC_STATUS_INTERRUPT 0x0008 /* Interrupt Status */ #define PCI_EXP_DPC_RP_BUSY 0x0010 /* Root Port Busy */ #define PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x0060 /* Trig Reason Extension */ 
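The pci_regs.h hunk above widens the PASID capability masks to their full 16-bit width and adds PCI_PASID_CAP_WIDTH for the Max PASID Width field in bits [12:8]. A small, self-contained decode of that field; the helper name is illustrative:

```c
#include <stdint.h>

#define PCI_PASID_CAP_WIDTH 0x1f00   /* as defined in the hunk above */

/* Max PASID Width lives in bits [12:8] of the PASID capability register;
 * a device supporting 2^20 PASIDs reports the value 20 here. */
static inline unsigned int pasid_cap_max_width(uint16_t pasid_cap_reg)
{
    return (pasid_cap_reg & PCI_PASID_CAP_WIDTH) >> 8;
}
```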
+#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO 0x0000 /* RP PIO error */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_SW_TRIGGER 0x0020 /* DPC SW Trigger bit */ +#define PCI_EXP_DPC_RP_PIO_FEP 0x1f00 /* RP PIO First Err Ptr */ #define PCI_EXP_DPC_SOURCE_ID 0x0A /* DPC Source Identifier */ @@ -1088,6 +1100,8 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_VALUE 0x03ff0000 /* LTR_L1.2_THRESHOLD_Value */ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +#define PCI_L1SS_CTL2_T_PWR_ON_SCALE 0x00000003 /* T_POWER_ON Scale */ +#define PCI_L1SS_CTL2_T_PWR_ON_VALUE 0x000000f8 /* T_POWER_ON Value */ /* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ #define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ diff --git a/include/standard-headers/linux/vhost_types.h b/include/standard-headers/linux/vhost_types.h index 5ad07e134aed3525943388a18b5bded129624f58..46fc53cd83ec4d60beeff4eeca06b8ae83c807df 100644 --- a/include/standard-headers/linux/vhost_types.h +++ b/include/standard-headers/linux/vhost_types.h @@ -185,5 +185,14 @@ struct vhost_vdpa_iova_range { * DRIVER_OK */ #define VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK 0x6 +/* Device may expose the virtqueue's descriptor area, driver area and + * device area to a different group for ASID binding than where its + * buffers may reside. Requires VHOST_BACKEND_F_IOTLB_ASID. + */ +#define VHOST_BACKEND_F_DESC_ASID 0x7 +/* IOTLB don't flush memory mapping across device reset */ +#define VHOST_BACKEND_F_IOTLB_PERSIST 0x8 +/* device can use bytemap log */ +#define VHOST_BACKEND_F_BYTEMAPLOG 0x3f #endif diff --git a/include/standard-headers/linux/virtio_config.h b/include/standard-headers/linux/virtio_config.h index 8a7d0dc8b0070343636a72926dbea66ceb01b184..bfd1ca643e7f4a62127c295bdac8ccc4da068cc0 100644 --- a/include/standard-headers/linux/virtio_config.h +++ b/include/standard-headers/linux/virtio_config.h @@ -103,6 +103,11 @@ */ #define VIRTIO_F_NOTIFICATION_DATA 38 +/* This feature indicates that the driver uses the data provided by the device + * as a virtqueue identifier in available buffer notifications. + */ +#define VIRTIO_F_NOTIF_CONFIG_DATA 39 + /* * This feature indicates that the driver can reset a queue individually. */ diff --git a/include/standard-headers/linux/virtio_pci.h b/include/standard-headers/linux/virtio_pci.h index be912cfc957cd78f0cade4c99ad03c0513e03c4d..b7fdfd066878cb89a091edcfe79c11196f3361d7 100644 --- a/include/standard-headers/linux/virtio_pci.h +++ b/include/standard-headers/linux/virtio_pci.h @@ -166,6 +166,17 @@ struct virtio_pci_common_cfg { uint32_t queue_used_hi; /* read-write */ }; +/* + * Warning: do not use sizeof on this: use offsetofend for + * specific fields you need. 
+ */ +struct virtio_pci_modern_common_cfg { + struct virtio_pci_common_cfg cfg; + + uint16_t queue_notify_data; /* read-write */ + uint16_t queue_reset; /* read-write */ +}; + /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */ struct virtio_pci_cfg_cap { struct virtio_pci_cap cap; diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h index ef91fc28bbdfa3dab181e11c5a4dc42fce1bad92..7a32e7f820325ed677db482893faa60c7d43b5d4 100644 --- a/include/sysemu/accel-ops.h +++ b/include/sysemu/accel-ops.h @@ -53,6 +53,9 @@ struct AccelOpsClass { int (*insert_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); int (*remove_breakpoint)(CPUState *cpu, int type, vaddr addr, vaddr len); void (*remove_all_breakpoints)(CPUState *cpu); + + void (*control_pre_system_reset)(void); + void (*control_post_system_reset)(void); }; #endif /* ACCEL_OPS_H */ diff --git a/include/sysemu/block-backend-common.h b/include/sysemu/block-backend-common.h index 780cea7305aadd131e3e2974053ab33e86a00ac9..5a1cdac9c4a1040da20f79c6721cb627b7321267 100644 --- a/include/sysemu/block-backend-common.h +++ b/include/sysemu/block-backend-common.h @@ -16,6 +16,9 @@ #include "qemu/iov.h" #include "block/throttle-groups.h" +/* block backend default retry interval */ +#define BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL 1000 + /* * TODO Have to include block/block.h for a bunch of block layer * types. Unfortunately, this pulls in the whole BlockDriverState @@ -71,6 +74,10 @@ typedef struct BlockDevOps { * Is the device still busy? */ bool (*drained_poll)(void *opaque); + /* + * Runs when retrying failed requests. + */ + void (*retry_request_cb)(void *opaque); /* * I/O API functions. These functions are thread-safe. diff --git a/include/sysemu/block-backend-global-state.h b/include/sysemu/block-backend-global-state.h index 49c12b0fa9387f5d9be6c18408ff60592c981a50..d56592c22ed91b57c615eacbcd3192ac7dbfe305 100644 --- a/include/sysemu/block-backend-global-state.h +++ b/include/sysemu/block-backend-global-state.h @@ -84,6 +84,9 @@ int blk_commit_all(void); bool blk_in_drain(BlockBackend *blk); void blk_drain(BlockBackend *blk); void blk_drain_all(void); +void blk_set_on_error_retry_interval(BlockBackend *blk, int64_t interval); +void blk_set_on_error_retry_timeout(BlockBackend *blk, int64_t timeout); +void blk_error_retry_reset_timeout(BlockBackend *blk); void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, BlockdevOnError on_write_error); bool blk_supports_write_perm(BlockBackend *blk); diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index b4a566cfe75274f6d8f4bfe8abb9e409a4c2e440..f24d27daf5269d658e2249cb1482d88d23741bea 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -44,6 +44,8 @@ extern int icount_align_option; void qemu_cpu_kick_self(void); bool cpus_are_resettable(void); +void cpus_control_pre_system_reset(void); +void cpus_control_post_system_reset(void); void cpu_synchronize_all_states(void); void cpu_synchronize_all_post_reset(void); diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h index a1ac5bc1b5435be8ef97d39acb0bf99cfb94e032..5a49a306284d26af967254ba064a134ab68305ec 100644 --- a/include/sysemu/dma.h +++ b/include/sysemu/dma.h @@ -152,7 +152,7 @@ static inline MemTxResult dma_memory_read(AddressSpace *as, dma_addr_t addr, } /** - * address_space_write: Write to address space from DMA controller. + * dma_memory_write: Write to address space from DMA controller. 
* * Return a MemTxResult indicating whether the operation succeeded * or failed (eg unassigned memory, device rejected the transaction, @@ -189,7 +189,7 @@ MemTxResult dma_memory_set(AddressSpace *as, dma_addr_t addr, uint8_t c, dma_addr_t len, MemTxAttrs attrs); /** - * address_space_map: Map a physical memory region into a host virtual address. + * dma_memory_map: Map a physical memory region into a host virtual address. * * May map a subset of the requested range, given by and returned in @plen. * May return %NULL and set *@plen to zero(0), if resources needed to perform @@ -216,16 +216,15 @@ static inline void *dma_memory_map(AddressSpace *as, } /** - * address_space_unmap: Unmaps a memory region previously mapped - * by dma_memory_map() + * dma_memory_unmap: Unmaps a memory region previously mapped by dma_memory_map() * * Will also mark the memory as dirty if @dir == %DMA_DIRECTION_FROM_DEVICE. * @access_len gives the amount of memory that was actually read or written * by the caller. * * @as: #AddressSpace used - * @buffer: host pointer as returned by address_space_map() - * @len: buffer length as returned by address_space_map() + * @buffer: host pointer as returned by dma_memory_map() + * @len: buffer length as returned by dma_memory_map() * @dir: indicates the transfer direction * @access_len: amount of data actually transferred */ diff --git a/include/sysemu/host_iommu_device.h b/include/sysemu/host_iommu_device.h new file mode 100644 index 0000000000000000000000000000000000000000..22c76a37a79504d7de552655545395a2a7544ed7 --- /dev/null +++ b/include/sysemu/host_iommu_device.h @@ -0,0 +1,111 @@ +/* + * Host IOMMU device abstract declaration + * + * Copyright (C) 2024 Intel Corporation. + * + * Authors: Zhenzhong Duan + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef HOST_IOMMU_DEVICE_H +#define HOST_IOMMU_DEVICE_H + +#include "qom/object.h" +#include "qapi/error.h" + +/** + * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities. + * + * @type: host platform IOMMU type. + * + * @hw_caps: host platform IOMMU capabilities (e.g. on IOMMUFD this represents + * the @out_capabilities value returned from IOMMU_GET_HW_INFO ioctl) + */ +typedef struct HostIOMMUDeviceCaps { + uint32_t type; + uint64_t hw_caps; + uint8_t max_pasid_log2; +} HostIOMMUDeviceCaps; + +#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device" +OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE) + +struct HostIOMMUDevice { + Object parent_obj; + + char *name; + void *agent; /* pointer to agent device, ie. VFIO or VDPA device */ + HostIOMMUDeviceCaps caps; +}; + +/** + * struct HostIOMMUDeviceClass - The base class for all host IOMMU devices. + * + * Different types of host devices (e.g., VFIO or VDPA device) or devices + * with different backend (e.g., VFIO legacy container or IOMMUFD backend) + * will have different implementations of the HostIOMMUDeviceClass. + */ +struct HostIOMMUDeviceClass { + ObjectClass parent_class; + + /** + * @realize: initialize host IOMMU device instance further. + * + * Mandatory callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. 
+ */ + bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @realize_late: initialize host IOMMU device instance after attachment, + * some elements, e.g., ioas, are ready only after attachment. + * This callback initializes them. + * + * Optional callback. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @opaque: pointer to agent device of this host IOMMU device, + * e.g., VFIO base device or VDPA device. + * + * @errp: pass an Error out when realize fails. + * + * Returns: true on success, false on failure. + */ + bool (*realize_late)(HostIOMMUDevice *hiod, void *opaque, Error **errp); + /** + * @get_cap: check if a host IOMMU device capability is supported. + * + * Optional callback; if not implemented, queries of @cap are + * not supported. + * + * @hiod: pointer to a host IOMMU device instance. + * + * @cap: capability to check. + * + * @errp: pass an Error out when the capability query fails. + * + * Returns: <0 on failure, 0 if a @cap is unsupported, or else + * 1 or some positive value for some special @cap, + * e.g., HOST_IOMMU_DEVICE_CAP_AW_BITS. + */ + int (*get_cap)(HostIOMMUDevice *hiod, int cap, Error **errp); +}; + +/* + * Host IOMMU device capability list. + */ +#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE 0 +#define HOST_IOMMU_DEVICE_CAP_AW_BITS 1 + +#define HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX 64 +#endif diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h new file mode 100644 index 0000000000000000000000000000000000000000..908c94d81145a69293677d06ec09e42b3bc235cd --- /dev/null +++ b/include/sysemu/iommufd.h @@ -0,0 +1,144 @@ +/* + * iommufd container backend declaration + * + * Copyright (C) 2024 Intel Corporation. + * Copyright Red Hat, Inc. 2024 + * + * Authors: Yi Liu + * Eric Auger + * Zhenzhong Duan + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef SYSEMU_IOMMUFD_H +#define SYSEMU_IOMMUFD_H + +#include "qom/object.h" +#include "exec/hwaddr.h" +#include "exec/cpu-common.h" +#include "sysemu/host_iommu_device.h" + +#define TYPE_IOMMUFD_BACKEND "iommufd" +OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND) + +struct IOMMUFDBackendClass { + ObjectClass parent_class; +}; + +struct IOMMUFDBackend { + Object parent; + + /*< protected >*/ + int fd; /* /dev/iommu file descriptor */ + bool owned; /* is the /dev/iommu opened internally */ + uint32_t users; + + /*< public >*/ +}; + +typedef struct IOMMUFDViommu { + IOMMUFDBackend *iommufd; + uint32_t s2_hwpt_id; + uint32_t viommu_id; +} IOMMUFDViommu; + +int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp); +void iommufd_backend_disconnect(IOMMUFDBackend *be); + +int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id, + Error **errp); +void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id); +int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova, + ram_addr_t size, void *vaddr, bool readonly); +int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id, + hwaddr iova, ram_addr_t size); +bool iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid, + uint32_t *type, void *data, uint32_t len, + uint64_t *caps, uint8_t *max_pasid_log2, + Error **errp); +bool iommufd_backend_alloc_hwpt(IOMMUFDBackend *be, uint32_t dev_id, + uint32_t pt_id, uint32_t flags, + uint32_t data_type, uint32_t data_len, + void *data_ptr, uint32_t *out_hwpt, + uint32_t *out_fault_fd, Error **errp); +bool iommufd_backend_set_dirty_tracking(IOMMUFDBackend *be, uint32_t hwpt_id, + bool start, Error **errp);
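Taken together, the iommufd backend entry points above suggest a connect, allocate, map lifecycle against /dev/iommu. A hedged sketch of that flow under the signatures declared here; demo_map_one_range() is illustrative, not a QEMU function:

```c
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "sysemu/iommufd.h"

/* Sketch: connect to /dev/iommu, carve out an IOAS, map one range,
 * then tear everything down again. */
static int demo_map_one_range(IOMMUFDBackend *be, void *vaddr,
                              hwaddr iova, ram_addr_t size, Error **errp)
{
    uint32_t ioas_id;
    int ret;

    if (iommufd_backend_connect(be, errp) < 0) {
        return -1;
    }
    if (iommufd_backend_alloc_ioas(be, &ioas_id, errp) < 0) {
        iommufd_backend_disconnect(be);
        return -1;
    }
    ret = iommufd_backend_map_dma(be, ioas_id, iova, size, vaddr,
                                  /* readonly */ false);
    if (!ret) {
        ret = iommufd_backend_unmap_dma(be, ioas_id, iova, size);
    }
    iommufd_backend_free_id(be, ioas_id);
    iommufd_backend_disconnect(be);
    return ret;
}
```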
+bool iommufd_backend_get_dirty_bitmap(IOMMUFDBackend *be, uint32_t hwpt_id, + uint64_t iova, ram_addr_t size, + uint64_t page_size, uint64_t *data, + Error **errp); +int iommufd_backend_invalidate_cache(IOMMUFDBackend *be, uint32_t hwpt_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); +struct IOMMUFDViommu *iommufd_backend_alloc_viommu(IOMMUFDBackend *be, + uint32_t dev_id, + uint32_t viommu_type, + uint32_t hwpt_id); +int iommufd_viommu_invalidate_cache(IOMMUFDBackend *be, uint32_t viommu_id, + uint32_t data_type, uint32_t entry_len, + uint32_t *entry_num, void *data_ptr); + +#define TYPE_HOST_IOMMU_DEVICE_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd" +OBJECT_DECLARE_TYPE(HostIOMMUDeviceIOMMUFD, HostIOMMUDeviceIOMMUFDClass, + HOST_IOMMU_DEVICE_IOMMUFD) + +/* Abstract of host IOMMU device with iommufd backend */ +struct HostIOMMUDeviceIOMMUFD { + HostIOMMUDevice parent_obj; + + IOMMUFDBackend *iommufd; + uint32_t devid; + uint32_t ioas_id; +}; + +struct HostIOMMUDeviceIOMMUFDClass { + HostIOMMUDeviceClass parent_class; + + /** + * @attach_hwpt: attach host IOMMU device to IOMMUFD hardware page table. + * VFIO and VDPA devices can have different implementations. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @hwpt_id: ID of IOMMUFD hardware page table. + * + * @errp: pass an Error out when attachment fails. + * + * Returns: true on success, false on failure. + */ + bool (*attach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, uint32_t hwpt_id, + Error **errp); + /** + * @detach_hwpt: detach host IOMMU device from IOMMUFD hardware page table. + * VFIO and VDPA devices can have different implementations. + * + * Mandatory callback. + * + * @idev: host IOMMU device backed by IOMMUFD backend. + * + * @errp: pass an Error out when detachment fails. + * + * Returns: true on success, false on failure.
+ */ + bool (*detach_hwpt)(HostIOMMUDeviceIOMMUFD *idev, Error **errp); +}; + +bool host_iommu_device_iommufd_attach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + uint32_t hwpt_id, Error **errp); +bool host_iommu_device_iommufd_detach_hwpt(HostIOMMUDeviceIOMMUFD *idev, + Error **errp); + +typedef struct IOMMUFDVdev { + HostIOMMUDeviceIOMMUFD *idev; + IOMMUFDViommu *viommu; + uint32_t vdev_id; + uint64_t virt_id; +} IOMMUFDVdev; + +struct IOMMUFDVdev *iommufd_backend_alloc_vdev(HostIOMMUDeviceIOMMUFD *idev, + IOMMUFDViommu *viommu, + uint64_t virt_id); +#endif diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index d6148781642107de0995af400331bbde126791b8..098257e72f08dff9155d36fe39df1a5786c2fa7c 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -19,6 +19,7 @@ #include "exec/memattrs.h" #include "qemu/accel.h" #include "qom/object.h" +#include "linux-headers/linux/kvm.h" #ifdef NEED_CPU_H # ifdef CONFIG_KVM @@ -32,6 +33,7 @@ #ifdef CONFIG_KVM_IS_POSSIBLE extern bool kvm_allowed; +extern bool virtcca_cvm_allowed; extern bool kvm_kernel_irqchip; extern bool kvm_split_irqchip; extern bool kvm_async_interrupts_allowed; @@ -42,8 +44,11 @@ extern bool kvm_gsi_routing_allowed; extern bool kvm_gsi_direct_mapping; extern bool kvm_readonly_mem_allowed; extern bool kvm_msi_use_devid; +extern bool kvm_csv3_allowed; #define kvm_enabled() (kvm_allowed) +#define virtcca_cvm_enabled() (virtcca_cvm_allowed) +#define VIRTCCA_CVM_TYPE (1UL << 8) /** * kvm_irqchip_in_kernel: * @@ -143,9 +148,25 @@ extern bool kvm_msi_use_devid; */ #define kvm_msi_devid_required() (kvm_msi_use_devid) +/** + * kvm_csv3_enabled: + * Returns: true if CSV3 feature is used for the VM. + */ +#define kvm_csv3_enabled() (kvm_csv3_allowed) + +/** + * kvm_csv3_should_set_priv_mem: + * Returns: true if we should explicitly request + * KVM_CSV3_SET_GUEST_PRIVATE_MEMORY. + */ +#define kvm_csv3_should_set_priv_mem() \ + (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM) + #else #define kvm_enabled() (0) +#define virtcca_cvm_enabled() (0) +#define VIRTCCA_CVM_TYPE (0) #define kvm_irqchip_in_kernel() (false) #define kvm_irqchip_is_split() (false) #define kvm_async_interrupts_enabled() (false) @@ -157,6 +178,8 @@ extern bool kvm_msi_use_devid; #define kvm_gsi_direct_mapping() (false) #define kvm_readonly_mem_enabled() (false) #define kvm_msi_devid_required() (false) +#define kvm_csv3_enabled() (false) +#define kvm_csv3_should_set_priv_mem() (false) #endif /* CONFIG_KVM_IS_POSSIBLE */ @@ -206,6 +229,14 @@ int kvm_has_gsi_routing(void); */ bool kvm_arm_supports_user_irq(void); +/* + * The default HDBSS size. The value ranges [0, 9]. + * Set to 0 to disable the HDBSS feature. + */ +#define DEFAULT_HDBSS_BUFFER_SIZE 0 +#define MAX_HDBSS_BUFFER_SIZE 9 + +void kvm_update_hdbss_cap(bool enable, int hdbss_buffer_size); int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); int kvm_on_sigbus(int code, void *addr); @@ -313,6 +344,31 @@ int kvm_create_device(KVMState *s, uint64_t type, bool test); */ bool kvm_device_supported(int vmfd, uint64_t type); +/** + * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU + * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created. + * + * @returns: 0 when success, errno (<0) when failed. + */ +int kvm_create_vcpu(CPUState *cpu); + +/** + * kvm_park_vcpu - Park QEMU KVM vCPU context + * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked. 
+ * + * @returns: none + */ +void kvm_park_vcpu(CPUState *cpu); + +/** + * kvm_unpark_vcpu - unpark QEMU KVM vCPU context + * @s: KVM State + * @vcpu_id: Architecture vCPU ID of the parked vCPU + * + * @returns: KVM fd + */ +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id); + /* Arch specific hooks */ extern const KVMCapabilityInfo kvm_arch_required_capabilities[]; @@ -453,7 +509,7 @@ void kvm_init_cpu_signals(CPUState *cpu); * @return: virq (>=0) when success, errno (<0) when failed. */ int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev); -int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, +int kvm_irqchip_update_msi_route(KVMRouteChange *c, int virq, MSIMessage msg, PCIDevice *dev); void kvm_irqchip_commit_routes(KVMState *s); @@ -490,6 +546,8 @@ bool kvm_kernel_irqchip_allowed(void); bool kvm_kernel_irqchip_required(void); bool kvm_kernel_irqchip_split(void); +bool kvm_smccc_filter_enabled(void); + /** * kvm_arch_irqchip_create: * @KVMState: The KVMState pointer @@ -538,4 +596,12 @@ bool kvm_arch_cpu_check_are_resettable(void); bool kvm_dirty_ring_enabled(void); uint32_t kvm_dirty_ring_size(void); + +int kvm_load_user_data(hwaddr loader_start, hwaddr image_end, hwaddr initrd_start, hwaddr dtb_end, hwaddr ram_size, + struct kvm_numa_info *numa_info); + +#ifdef __aarch64__ +int kvm_create_shadow_device(PCIDevice *dev); +int kvm_delete_shadow_device(PCIDevice *dev); +#endif #endif diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index fd846394be104b9b56b8f4c0b7b56305ac486a9a..b2d2c5947795868a2c5e003b3f7bd45c6ef4372a 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -112,6 +112,7 @@ struct KVMState uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */ uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */ bool kvm_dirty_ring_with_bitmap; + bool kvm_smccc_filter_enabled; uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */ struct KVMDirtyRingReaper reaper; NotifyVmexitOption notify_vmexit; diff --git a/include/sysemu/rtc.h b/include/sysemu/rtc.h index 0fc8ad6fdf1e38ca0c7d1488b7899797c18bb1aa..3edae762d46a64ceb1bc06b5b27235ce38a07b14 100644 --- a/include/sysemu/rtc.h +++ b/include/sysemu/rtc.h @@ -54,5 +54,7 @@ void qemu_get_timedate(struct tm *tm, time_t offset); * then this function will return 3600. 
*/ time_t qemu_timedate_diff(struct tm *tm); - +time_t get_rtc_date_diff(void); +void set_rtc_date_diff(time_t diff); +int64_t qmp_query_rtc_date_diff(Error **errp); #endif diff --git a/include/tcg/startup.h b/include/tcg/startup.h index f71305765c0291138a2b01ab6605dfa12e0cd783..c6cb1d92a76b36ef059368f9606f28eca256f30f 100644 --- a/include/tcg/startup.h +++ b/include/tcg/startup.h @@ -45,6 +45,11 @@ void tcg_init(size_t tb_size, int splitwx, unsigned max_cpus); */ void tcg_register_thread(void); +/** + * tcg_unregister_thread: Unregister this thread with the TCG runtime + */ +void tcg_unregister_thread(void); + /** * tcg_prologue_init(): Generate the code for the TCG prologue * diff --git a/include/ui/rect.h b/include/ui/rect.h index 94898f92d0d4e893d4590b1e6d41bf51d07a13d9..68f05d78a8e6b0cadc80c229dc2a1fce889e1adf 100644 --- a/include/ui/rect.h +++ b/include/ui/rect.h @@ -19,7 +19,7 @@ static inline void qemu_rect_init(QemuRect *rect, uint16_t width, uint16_t height) { rect->x = x; - rect->y = x; + rect->y = y; rect->width = width; rect->height = height; } diff --git a/io/channel-tls.c b/io/channel-tls.c index 58fe1aceeeafd473dfb3fde267f18eccee6bdb65..a8ad89c3d11053c44633c68d78f56d77854225dc 100644 --- a/io/channel-tls.c +++ b/io/channel-tls.c @@ -69,37 +69,40 @@ qio_channel_tls_new_server(QIOChannel *master, const char *aclname, Error **errp) { - QIOChannelTLS *ioc; + QIOChannelTLS *tioc; + QIOChannel *ioc; - ioc = QIO_CHANNEL_TLS(object_new(TYPE_QIO_CHANNEL_TLS)); + tioc = QIO_CHANNEL_TLS(object_new(TYPE_QIO_CHANNEL_TLS)); + ioc = QIO_CHANNEL(tioc); - ioc->master = master; + tioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if (qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { - qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SHUTDOWN); + qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } object_ref(OBJECT(master)); - ioc->session = qcrypto_tls_session_new( + tioc->session = qcrypto_tls_session_new( creds, NULL, aclname, QCRYPTO_TLS_CREDS_ENDPOINT_SERVER, errp); - if (!ioc->session) { + if (!tioc->session) { goto error; } qcrypto_tls_session_set_callbacks( - ioc->session, + tioc->session, qio_channel_tls_write_handler, qio_channel_tls_read_handler, - ioc); + tioc); - trace_qio_channel_tls_new_server(ioc, master, creds, aclname); - return ioc; + trace_qio_channel_tls_new_server(tioc, master, creds, aclname); + return tioc; error: - object_unref(OBJECT(ioc)); + object_unref(OBJECT(tioc)); return NULL; } @@ -116,6 +119,7 @@ qio_channel_tls_new_client(QIOChannel *master, ioc = QIO_CHANNEL(tioc); tioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if (qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } diff --git a/io/channel-websock.c b/io/channel-websock.c index a12acc27cf25d312564c316469949482cb7020d6..de39f0d182d2610162c9e9a13c909b51e3d70518 100644 --- a/io/channel-websock.c +++ b/io/channel-websock.c @@ -883,6 +883,7 @@ qio_channel_websock_new_server(QIOChannel *master) ioc = QIO_CHANNEL(wioc); wioc->master = master; + ioc->follow_coroutine_ctx = master->follow_coroutine_ctx; if (qio_channel_has_feature(master, QIO_CHANNEL_FEATURE_SHUTDOWN)) { qio_channel_set_feature(ioc, QIO_CHANNEL_FEATURE_SHUTDOWN); } diff --git a/io/trace-events index 3cc5cf1efdf398b25701f3ea3aba61c1b6c6b435..79e1a19af7b70ce4870528c92b15f983066029a3 100644 --- a/io/trace-events +++ b/io/trace-events @@ -38,7 +38,7 @@
qio_channel_file_new_path(void *ioc, const char *path, int flags, int mode, int # channel-tls.c qio_channel_tls_new_client(void *ioc, void *master, void *creds, const char *hostname) "TLS new client ioc=%p master=%p creds=%p hostname=%s" -qio_channel_tls_new_server(void *ioc, void *master, void *creds, const char *aclname) "TLS new client ioc=%p master=%p creds=%p acltname=%s" +qio_channel_tls_new_server(void *ioc, void *master, void *creds, const char *aclname) "TLS new client ioc=%p master=%p creds=%p aclname=%s" qio_channel_tls_handshake_start(void *ioc) "TLS handshake start ioc=%p" qio_channel_tls_handshake_pending(void *ioc, int status) "TLS handshake pending ioc=%p status=%d" qio_channel_tls_handshake_fail(void *ioc) "TLS handshake fail ioc=%p" diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index 38e5957526c26a124df7d99e79a7cbfed6b25f6d..552fdcb18f290e4e7a2217ba03a8dc73a5463674 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -110,6 +110,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ #define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */ +#define KVM_ARM_VCPU_TEC 8 /* VCPU TEC state as part of cvm */ struct kvm_vcpu_init { __u32 target; @@ -491,6 +492,109 @@ struct kvm_smccc_filter { #define KVM_HYPERCALL_EXIT_SMC (1U << 0) #define KVM_HYPERCALL_EXIT_16BIT (1U << 1) +/* + * Get feature ID registers userspace writable mask. + * + * From DDI0487J.a, D19.2.66 ("ID_AA64MMFR2_EL1, AArch64 Memory Model + * Feature Register 2"): + * + * "The Feature ID space is defined as the System register space in + * AArch64 with op0==3, op1=={0, 1, 3}, CRn==0, CRm=={0-7}, + * op2=={0-7}." + * + * This covers all currently known R/O registers that indicate + * anything useful feature wise, including the ID registers. + * + * If we ever need to introduce a new range, it will be described as + * such in the range field. 
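+ * + * For instance, ID_AA64PFR0_EL1 (op0==3, op1==0, CRn==0, CRm==4, op2==0) maps to index (0 << 6) | (4 << 3) | 0 == 32 in the mask array, and the three permitted op1 values {0, 1, 3} collapse to {0, 1, 2} under the macro below, which is why the range holds 3 * 8 * 8 entries.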
+ */ +#define KVM_ARM_FEATURE_ID_RANGE_IDX(op0, op1, crn, crm, op2) \ + ({ \ + __u64 __op1 = (op1) & 3; \ + __op1 -= (__op1 == 3); \ + (__op1 << 6 | ((crm) & 7) << 3 | (op2)); \ + }) + +#define KVM_ARM_FEATURE_ID_RANGE 0 +#define KVM_ARM_FEATURE_ID_RANGE_SIZE (3 * 8 * 8) + +struct reg_mask_range { + __u64 addr; /* Pointer to mask array */ + __u32 range; /* Requested range */ + __u32 reserved[13]; +}; + +/* KVM_CAP_ARM_TMM on VM fd */ +#define KVM_CAP_ARM_TMM_CONFIG_CVM 0 +#define KVM_CAP_ARM_TMM_CREATE_RD 1 +#define KVM_CAP_ARM_TMM_POPULATE_CVM 2 +#define KVM_CAP_ARM_TMM_ACTIVATE_CVM 3 + +#define KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA256 0 +#define KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA512 1 + +#define KVM_CAP_ARM_TMM_RPV_SIZE 64 + +/* List of configuration items accepted for KVM_CAP_ARM_TMM_CONFIG_CVM */ +#define KVM_CAP_ARM_TMM_CFG_RPV 0 +#define KVM_CAP_ARM_TMM_CFG_HASH_ALGO 1 +#define KVM_CAP_ARM_TMM_CFG_SVE 2 +#define KVM_CAP_ARM_TMM_CFG_DBG 3 +#define KVM_CAP_ARM_TMM_CFG_PMU 4 +#define KVM_CAP_ARM_TMM_CFG_KAE 5 + +#define KVM_ARM_TMM_MAX_KAE_VF_NUM 11 + +struct kvm_cap_arm_tmm_config_item { + __u32 cfg; + union { + /* cfg == KVM_CAP_ARM_TMM_CFG_RPV */ + struct { + __u8 rpv[KVM_CAP_ARM_TMM_RPV_SIZE]; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_HASH_ALGO */ + struct { + __u32 hash_algo; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_SVE */ + struct { + __u32 sve_vq; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_DBG */ + struct { + __u32 num_brps; + __u32 num_wrps; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_PMU */ + struct { + __u32 num_pmu_cntrs; + }; + + /* cfg == KVM_CAP_ARM_TMM_CFG_KAE */ + struct { + __u32 kae_vf_num; + __u64 sec_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + __u64 hpre_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + }; + /* Fix the size of the union */ + __u8 reserved[256]; + }; +}; + +#define KVM_ARM_TMM_POPULATE_FLAGS_MEASURE (1U << 0) +struct kvm_cap_arm_tmm_populate_region_args { + __u64 populate_ipa_base1; + __u64 populate_ipa_size1; + __u64 populate_ipa_base2; + __u64 populate_ipa_size2; + __u32 flags; + __u32 reserved[3]; +}; + #endif #endif /* __ARM_KVM_H__ */ diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h index abe087c53b4b04348431716c1cb638b99d5a788a..756b013fb8324bd7a320e60cebec2ca692faa149 100644 --- a/linux-headers/asm-generic/unistd.h +++ b/linux-headers/asm-generic/unistd.h @@ -71,7 +71,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr) #define __NR_getcwd 17 __SYSCALL(__NR_getcwd, sys_getcwd) #define __NR_lookup_dcookie 18 -__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie) +__SYSCALL(__NR_lookup_dcookie, sys_ni_syscall) #define __NR_eventfd2 19 __SYSCALL(__NR_eventfd2, sys_eventfd2) #define __NR_epoll_create1 20 @@ -816,15 +816,21 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) - #define __NR_cachestat 451 __SYSCALL(__NR_cachestat, sys_cachestat) - #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +#define __NR_map_shadow_stack 453 +__SYSCALL(__NR_map_shadow_stack, sys_map_shadow_stack) +#define __NR_futex_wake 454 +__SYSCALL(__NR_futex_wake, sys_futex_wake) +#define __NR_futex_wait 455 +__SYSCALL(__NR_futex_wait, sys_futex_wait) +#define __NR_futex_requeue 456 +__SYSCALL(__NR_futex_requeue, sys_futex_requeue) #undef __NR_syscalls -#define __NR_syscalls 453 +#define __NR_syscalls 457 /* * 32 bit systems traditionally used different diff --git
a/linux-headers/asm-loongarch/bitsperlong.h b/linux-headers/asm-loongarch/bitsperlong.h new file mode 100644 index 0000000000000000000000000000000000000000..6dc0bb0c13b29dd814f403f2fd4efb3b36be0619 --- /dev/null +++ b/linux-headers/asm-loongarch/bitsperlong.h @@ -0,0 +1 @@ +#include <asm-generic/bitsperlong.h> diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h new file mode 100644 index 0000000000000000000000000000000000000000..d22b19e134fb993c19e2484a2172cd5843cb4e5c --- /dev/null +++ b/linux-headers/asm-loongarch/kvm.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __UAPI_ASM_LOONGARCH_KVM_H +#define __UAPI_ASM_LOONGARCH_KVM_H + +#include <linux/types.h> + +/* + * KVM LoongArch specific structures and definitions. + * + * Some parts derived from the x86 version of this file. + */ + +#define __KVM_HAVE_READONLY_MEM +#define __KVM_HAVE_GUEST_DEBUG + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 +#define __KVM_HAVE_IRQ_LINE + +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 +/* + * for KVM_GET_REGS and KVM_SET_REGS + */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 pc; +}; + +/* + * for KVM_GET_FPU and KVM_SET_FPU + */ +struct kvm_fpu { + __u32 fcsr; + __u64 fcc; /* 8x8 */ + struct kvm_fpureg { + __u64 val64[4]; + } fpr[32]; +}; + +/* + * For LoongArch, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access various + * registers. The id field is broken down as follows: + * + * bits[63..52] - As per linux/kvm.h + * bits[51..32] - Must be zero. + * bits[31..16] - Register set. + * + * Register set = 0: GP registers from kvm_regs (see definitions below). + * + * Register set = 1: CSR registers. + * + * Register set = 2: KVM specific registers (see definitions below). + * + * Register set = 3: FPU / SIMD registers (see definitions below). + * + * Other register sets may be added in the future. Each set would + * have its own identifier in bits[31..16]. + */ + +#define KVM_REG_LOONGARCH_GPR (KVM_REG_LOONGARCH | 0x00000ULL) +#define KVM_REG_LOONGARCH_CSR (KVM_REG_LOONGARCH | 0x10000ULL) +#define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL) +#define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL) +#define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL) +#define KVM_REG_LOONGARCH_LBT (KVM_REG_LOONGARCH | 0x50000ULL) +#define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL) +#define KVM_CSR_IDX_MASK 0x7fff +#define KVM_CPUCFG_IDX_MASK 0x7fff + +/* + * KVM_REG_LOONGARCH_KVM - KVM specific control registers.
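+ * + * As a worked example of the encoding (a sketch, not normative): the counter register defined below is accessed through KVM_GET_ONE_REG/KVM_SET_ONE_REG with id KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 1, i.e. register set 2 in bits[31..16], the 64-bit size code from linux/kvm.h, and register number 1.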
+ */ + +#define KVM_REG_LOONGARCH_COUNTER (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_VCPU_RESET (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 2) +/* Debugging: Special instruction for software breakpoint */ +#define KVM_REG_LOONGARCH_DEBUG_INST (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 3) + +/* LBT registers */ +#define KVM_REG_LOONGARCH_LBT_SCR0 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_LBT_SCR1 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 2) +#define KVM_REG_LOONGARCH_LBT_SCR2 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 3) +#define KVM_REG_LOONGARCH_LBT_SCR3 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 4) +#define KVM_REG_LOONGARCH_LBT_EFLAGS (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 5) +#define KVM_REG_LOONGARCH_LBT_FTOP (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 6) + +#define LOONGARCH_REG_SHIFT 3 +#define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) +#define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) +#define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) + +/* Device Control API on vm fd */ +#define KVM_LOONGARCH_VM_FEAT_CTRL 0 +#define KVM_LOONGARCH_VM_FEAT_LSX 0 +#define KVM_LOONGARCH_VM_FEAT_LASX 1 +#define KVM_LOONGARCH_VM_FEAT_X86BT 2 +#define KVM_LOONGARCH_VM_FEAT_ARMBT 3 +#define KVM_LOONGARCH_VM_FEAT_MIPSBT 4 +#define KVM_LOONGARCH_VM_FEAT_PMU 5 +#define KVM_LOONGARCH_VM_FEAT_PV_IPI 6 +#define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7 + +/* Device Control API on vcpu fd */ +#define KVM_LOONGARCH_VCPU_CPUCFG 0 +#define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1 +#define KVM_LOONGARCH_VCPU_PVTIME_GPA 0 + +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +/* dummy definition */ +struct kvm_sregs { +}; + +struct kvm_iocsr_entry { + __u32 addr; + __u32 pad; + __u64 data; +}; + +#define KVM_NR_IRQCHIPS 1 +#define KVM_IRQCHIP_NUM_PINS 64 +#define KVM_MAX_CORES 256 + +#define KVM_DEV_LOONGARCH_IPI_GRP_REGS 0x40000001 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS 0x40000002 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS 0x40000003 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU 0x0 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE 0x1 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE 0x2 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL 0x40000004 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU 0x0 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE 0x1 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED 0x3 + +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS 0x40000005 +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL 0x40000006 +#define KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT 0 + +#endif /* __UAPI_ASM_LOONGARCH_KVM_H */ diff --git a/linux-headers/asm-loongarch/mman.h b/linux-headers/asm-loongarch/mman.h new file mode 100644 index 0000000000000000000000000000000000000000..8eebf89f5ab17884a98543f3b37a3b710355083b --- /dev/null +++ b/linux-headers/asm-loongarch/mman.h @@ -0,0 +1 @@ +#include <asm-generic/mman.h> diff --git a/linux-headers/asm-loongarch/unistd.h b/linux-headers/asm-loongarch/unistd.h new file mode 100644 index 0000000000000000000000000000000000000000..b344b1f917153b6d70ab7d434f05598b116d3642 --- /dev/null +++ b/linux-headers/asm-loongarch/unistd.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define __ARCH_WANT_NEW_STAT +#define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 + +#include <asm-generic/unistd.h> diff --git
a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h index 46d8500654c3c0648868c8d5d2a355acbd90dd97..994b6f008f5414d5f1cb6f6d08c5e5298bb70261 100644 --- a/linux-headers/asm-mips/unistd_n32.h +++ b/linux-headers/asm-mips/unistd_n32.h @@ -381,5 +381,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_N32_H */ diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h index c2f7ac673bb551c144d09e8e64b174995efec8c4..41dcf5877a1b80bbb01d20e8f3b77e16d9803d6d 100644 --- a/linux-headers/asm-mips/unistd_n64.h +++ b/linux-headers/asm-mips/unistd_n64.h @@ -357,5 +357,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_N64_H */ diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h index 757c68f2add872eb6a2ed66aa51229f7950b4530..ae9d334d96e3444c4eb39f924f5fec183100a094 100644 --- a/linux-headers/asm-mips/unistd_o32.h +++ b/linux-headers/asm-mips/unistd_o32.h @@ -427,5 +427,9 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) #endif /* _ASM_UNISTD_O32_H */ diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h index 8ef94bbac13839ce53ea6c6b79c6886f29055030..b9b23d66d7d9ae5f052f70577c38d573f3225ee9 100644 --- a/linux-headers/asm-powerpc/unistd_32.h +++ b/linux-headers/asm-powerpc/unistd_32.h @@ -434,6 +434,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h index 0e7ee43e884fdded1759437a3efad8da9751467d..cbb4b3e8f7c2f1fa9c572b4c37bdd99637c8925b 100644 --- a/linux-headers/asm-powerpc/unistd_64.h +++ b/linux-headers/asm-powerpc/unistd_64.h @@ -406,6 +406,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-riscv/kvm.h b/linux-headers/asm-riscv/kvm.h index 992c5e407104958532d7ba930d50447ebe56fa25..60d3b21dead7d8846050d20a96ef1a0b3ad1ba20 100644 --- a/linux-headers/asm-riscv/kvm.h +++ b/linux-headers/asm-riscv/kvm.h @@ -80,6 +80,7 @@ struct kvm_riscv_csr { unsigned long sip; unsigned long satp; unsigned long scounteren; + unsigned long senvcfg; }; /* AIA CSR registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ @@ -93,6 +94,11 @@ struct kvm_riscv_aia_csr { unsigned long iprio2h; }; +/* Smstateen CSR for 
KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_smstateen_csr { + unsigned long sstateen0; +}; + /* TIMER registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ struct kvm_riscv_timer { __u64 frequency; @@ -131,6 +137,8 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZICSR, KVM_RISCV_ISA_EXT_ZIFENCEI, KVM_RISCV_ISA_EXT_ZIHPM, + KVM_RISCV_ISA_EXT_SMSTATEEN, + KVM_RISCV_ISA_EXT_ZICOND, KVM_RISCV_ISA_EXT_MAX, }; @@ -148,6 +156,7 @@ enum KVM_RISCV_SBI_EXT_ID { KVM_RISCV_SBI_EXT_PMU, KVM_RISCV_SBI_EXT_EXPERIMENTAL, KVM_RISCV_SBI_EXT_VENDOR, + KVM_RISCV_SBI_EXT_DBCN, KVM_RISCV_SBI_EXT_MAX, }; @@ -178,10 +187,13 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_CSR (0x03 << KVM_REG_RISCV_TYPE_SHIFT) #define KVM_REG_RISCV_CSR_GENERAL (0x0 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_AIA (0x1 << KVM_REG_RISCV_SUBTYPE_SHIFT) +#define KVM_REG_RISCV_CSR_SMSTATEEN (0x2 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_REG(name) \ (offsetof(struct kvm_riscv_csr, name) / sizeof(unsigned long)) #define KVM_REG_RISCV_CSR_AIA_REG(name) \ (offsetof(struct kvm_riscv_aia_csr, name) / sizeof(unsigned long)) +#define KVM_REG_RISCV_CSR_SMSTATEEN_REG(name) \ + (offsetof(struct kvm_riscv_smstateen_csr, name) / sizeof(unsigned long)) /* Timer registers are mapped as type 4 */ #define KVM_REG_RISCV_TIMER (0x04 << KVM_REG_RISCV_TYPE_SHIFT) diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 716fa368ca711658c7fa54fa815b370f34089fd7..c093e6d5f9111ae5be58b0702251e19ecffc98cc 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -425,5 +425,9 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_S390_UNISTD_32_H */ diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index b2a11b1d139f0787b3b7201a46326b6f821278f4..114c0569a49aa5e8a50f9834e2487f0d9abc12b7 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -373,5 +373,9 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index d749ad1c24ec2dfee13a0665d781e03c235465cf..329649c377be129711ce0f9c76433ace56df4706 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -443,6 +443,10 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index cea67282ebfeadf28cb631222e535ab1ef8c520c..4583606ce68420d75bccf5c411da675f860744b3 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ b/linux-headers/asm-x86/unistd_64.h @@ -366,6 +366,9 @@ #define __NR_cachestat 451 #define __NR_fchmodat2 452 #define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index 
5b2e79bf4c4610246dfb9962052825d0be85806d..146d74d8e4b0146d80ead022189149e27441b8a1 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -318,6 +318,9 @@ #define __NR_set_mempolicy_home_node (__X32_SYSCALL_BIT + 450) #define __NR_cachestat (__X32_SYSCALL_BIT + 451) #define __NR_fchmodat2 (__X32_SYSCALL_BIT + 452) +#define __NR_futex_wake (__X32_SYSCALL_BIT + 454) +#define __NR_futex_wait (__X32_SYSCALL_BIT + 455) +#define __NR_futex_requeue (__X32_SYSCALL_BIT + 456) #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) #define __NR_ioctl (__X32_SYSCALL_BIT + 514) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index 218bf7ac98d07c589058af8cb1156582d47423db..3e57fee01cc7ec52083e30bfe789d88f39d6751f 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -37,16 +37,22 @@ enum { IOMMUFD_CMD_BASE = 0x80, IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, - IOMMUFD_CMD_IOAS_ALLOC, - IOMMUFD_CMD_IOAS_ALLOW_IOVAS, - IOMMUFD_CMD_IOAS_COPY, - IOMMUFD_CMD_IOAS_IOVA_RANGES, - IOMMUFD_CMD_IOAS_MAP, - IOMMUFD_CMD_IOAS_UNMAP, - IOMMUFD_CMD_OPTION, - IOMMUFD_CMD_VFIO_IOAS, - IOMMUFD_CMD_HWPT_ALLOC, - IOMMUFD_CMD_GET_HW_INFO, + IOMMUFD_CMD_IOAS_ALLOC = 0x81, + IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82, + IOMMUFD_CMD_IOAS_COPY = 0x83, + IOMMUFD_CMD_IOAS_IOVA_RANGES = 0x84, + IOMMUFD_CMD_IOAS_MAP = 0x85, + IOMMUFD_CMD_IOAS_UNMAP = 0x86, + IOMMUFD_CMD_OPTION = 0x87, + IOMMUFD_CMD_VFIO_IOAS = 0x88, + IOMMUFD_CMD_HWPT_ALLOC = 0x89, + IOMMUFD_CMD_GET_HW_INFO = 0x8a, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING = 0x8b, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, + IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, + IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, + IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, + IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, }; /** @@ -347,20 +353,116 @@ struct iommu_vfio_ioas { }; #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) +/** + * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as + * the parent HWPT in a nesting configuration. + * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is + * enforced on device attachment + * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is + * valid. + */ +enum iommufd_hwpt_alloc_flags { + IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, + IOMMU_HWPT_FAULT_ID_VALID = 1 << 2, +}; + +/** + * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table + * entry attributes + * @IOMMU_VTD_S1_SRE: Supervisor request + * @IOMMU_VTD_S1_EAFE: Extended access enable + * @IOMMU_VTD_S1_WPE: Write protect enable + */ +enum iommu_hwpt_vtd_s1_flags { + IOMMU_VTD_S1_SRE = 1 << 0, + IOMMU_VTD_S1_EAFE = 1 << 1, + IOMMU_VTD_S1_WPE = 1 << 2, +}; + +/** + * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table + * info (IOMMU_HWPT_DATA_VTD_S1) + * @flags: Combination of enum iommu_hwpt_vtd_s1_flags + * @pgtbl_addr: The base address of the stage-1 page table. + * @addr_width: The address width of the stage-1 page table + * @__reserved: Must be 0 + */ +struct iommu_hwpt_vtd_s1 { + __aligned_u64 flags; + __aligned_u64 pgtbl_addr; + __u32 addr_width; + __u32 __reserved; +}; + +/** + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words of the user space Stream Table Entry for + * the translation. Must be little-endian. 
+ * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass + * nested domain will translate the same as the nesting parent. The S1 will + * install a Context Descriptor Table pointing at userspace memory translated + * by the nesting parent. + */ +struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; +}; + +/** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table + */ +enum iommu_hwpt_data_type { + IOMMU_HWPT_DATA_NONE = 0, + IOMMU_HWPT_DATA_VTD_S1 = 1, + IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) - * @flags: Must be 0 + * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS to connect this HWPT to + * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data + * @fault_id: The ID of IOMMUFD_FAULT object. Valid only if flags field of + * IOMMU_HWPT_FAULT_ID_VALID is set. + * @__reserved2: Padding to 64-bit alignment. Must be 0. * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the * underlying iommu driver's iommu_domain kernel object. * - * A HWPT will be created with the IOVA mappings from the given IOAS. + * A kernel-managed HWPT will be created with the mappings from the given + * IOAS via the @pt_id. The @data_type for this allocation must be set to + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * + * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a + * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be + * allocated previously via the same ioctl from a given IOAS (@pt_id). In this + * case, the @data_type must be set to a pre-defined type corresponding to an + * I/O page table type supported by the underlying IOMMU hardware. The device + * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU + * instance. + * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr + * must be given. */ struct iommu_hwpt_alloc { __u32 size; @@ -369,13 +471,28 @@ struct iommu_hwpt_alloc { __u32 pt_id; __u32 out_hwpt_id; __u32 __reserved; + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; + __u32 fault_id; + __u32 __reserved2; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) +/** + * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info + * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings + * on a nested_parent domain. 
+ https://www.intel.com/content/www/us/en/content-details/772415/content-details.html + */ +enum iommu_hw_info_vtd_flags { + IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0, +}; + /** * struct iommu_hw_info_vtd - Intel VT-d hardware information * - * @flags: Must be 0 + * @flags: Combination of enum iommu_hw_info_vtd_flags * @__reserved: Must be 0 * * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec @@ -393,15 +510,72 @@ struct iommu_hw_info_vtd { __aligned_u64 ecap_reg; }; +/** + * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information + * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + * + * @flags: Must be set to 0 + * @__reserved: Must be 0 + * @idr: Implemented features for ARM SMMU Non-secure programming interface + * @iidr: Information about the implementation and implementer of ARM SMMU, + * and architecture version supported + * @aidr: ARM SMMU architecture version + * + * For the details of @idr, @iidr and @aidr, please refer to the chapters + * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. + * + * User space should read the underlying ARM SMMUv3 hardware information for + * the list of supported features. + * + * Note that these values reflect the raw HW capability, without any insight if + * any required kernel driver support is present. Bits may be set indicating the + * HW has functionality that is lacking kernel software support, such as BTM. If + * a VMM is using this information to construct emulated copies of these + * registers it should only forward bits that it knows it can support. + * + * In future, presence of required kernel support will be indicated in flags. + */ +struct iommu_hw_info_arm_smmuv3 { + __u32 flags; + __u32 __reserved; + __u32 idr[6]; + __u32 iidr; + __u32 aidr; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type + * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { - IOMMU_HW_INFO_TYPE_NONE, - IOMMU_HW_INFO_TYPE_INTEL_VTD, + IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, + IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, +}; + +/** + * enum iommufd_hw_capabilities + * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking + * If available, it means the following APIs + * are supported: + * + * IOMMU_HWPT_GET_DIRTY_BITMAP + * IOMMU_HWPT_SET_DIRTY_TRACKING + * + * @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it + * when the struct iommu_hw_info::out_max_pasid_log2 + * is zero. + * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it + * when the struct iommu_hw_info::out_max_pasid_log2 + * is zero. + */ +enum iommufd_hw_capabilities { + IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, + IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, + IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; /** @@ -415,6 +589,11 @@ enum iommu_hw_info_type { * the iommu type specific hardware information data * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. + * @out_capabilities: Output the generic iommu capability info type as defined + * in the enum iommufd_hw_capabilities. + * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. + * PCI devices turn to out_capabilities to check if the + * specific capability is supported or not.
* @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -438,7 +617,346 @@ struct iommu_hw_info { __u32 data_len; __aligned_u64 data_uptr; __u32 out_data_type; - __u32 __reserved; + __u8 out_max_pasid_log2; + __u8 __reserved[3]; + __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) + +/* + * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty + * tracking + * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking + */ +enum iommufd_hwpt_set_dirty_tracking_flags { + IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1, +}; + +/** + * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING) + * @size: sizeof(struct iommu_hwpt_set_dirty_tracking) + * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @__reserved: Must be 0 + * + * Toggle dirty tracking on an HW pagetable. + */ +struct iommu_hwpt_set_dirty_tracking { + __u32 size; + __u32 flags; + __u32 hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) + +/** + * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits + * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing + * any dirty bits metadata. This flag + * can be passed in the expectation + * where the next operation is an unmap + * of the same IOVA range. + * + */ +enum iommufd_hwpt_get_dirty_bitmap_flags { + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1, +}; + +/** + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags + * @__reserved: Must be 0 + * @iova: base IOVA of the bitmap first bit + * @length: IOVA range size + * @page_size: page size granularity of each bit in the bitmap + * @data: bitmap where to set the dirty bits. The bitmap bits each + * represent a page_size which you deviate from an arbitrary iova. + * + * Checking a given IOVA is dirty: + * + * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) + * + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap + * with the dirty IOVAs. In doing so it will also by default clear any + * dirty bit metadata set in the IOPTE. + */ +struct iommu_hwpt_get_dirty_bitmap { + __u32 size; + __u32 hwpt_id; + __u32 flags; + __u32 __reserved; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 data; +}; +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) + +/** + * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation + * Data Type + * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 + * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3 + */ +enum iommu_hwpt_invalidate_data_type { + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1, +}; + +/** + * enum iommu_hwpt_vtd_s1_invalidate_flags - Flags for Intel VT-d + * stage-1 cache invalidation + * @IOMMU_VTD_INV_FLAGS_LEAF: Indicates whether the invalidation applies + * to all-levels page structure cache or just + * the leaf PTE cache. 
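+ * + * A hedged usage sketch (the values are illustrative, not from the kernel docs): after changing only leaf stage-1 PTEs, userspace could queue + * + * struct iommu_hwpt_vtd_s1_invalidate inv = { + * .addr = 0x100000, + * .npages = 16, + * .flags = IOMMU_VTD_INV_FLAGS_LEAF, + * }; + * + * as one entry of the IOMMU_HWPT_INVALIDATE request array described further below, avoiding a full page-structure cache flush.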
+ */ +enum iommu_hwpt_vtd_s1_invalidate_flags { + IOMMU_VTD_INV_FLAGS_LEAF = 1 << 0, +}; + +/** + * struct iommu_hwpt_vtd_s1_invalidate - Intel VT-d cache invalidation + * (IOMMU_HWPT_INVALIDATE_DATA_VTD_S1) + * @addr: The start address of the range to be invalidated. It needs to + * be 4KB aligned. + * @npages: Number of contiguous 4K pages to be invalidated. + * @flags: Combination of enum iommu_hwpt_vtd_s1_invalidate_flags + * @__reserved: Must be 0 + * + * The Intel VT-d specific invalidation data for user-managed stage-1 cache + * invalidation in nested translation. Userspace uses this structure to + * tell the impacted cache scope after modifying the stage-1 page table. + * + * Invalidating all the caches related to the page table by setting @addr + * to be 0 and @npages to be U64_MAX. + * + * The device TLB will be invalidated automatically if ATS is enabled. + */ +struct iommu_hwpt_vtd_s1_invalidate { + __aligned_u64 addr; + __aligned_u64 npages; + __u32 flags; + __u32 __reserved; +}; + +/** + * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cache invalidation + * (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3) + * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ. + * Must be little-endian. + * + * Supported command list only when passing in a vIOMMU via @hwpt_id: + * CMDQ_OP_TLBI_NSNH_ALL + * CMDQ_OP_TLBI_NH_VA + * CMDQ_OP_TLBI_NH_VAA + * CMDQ_OP_TLBI_NH_ALL + * CMDQ_OP_TLBI_NH_ASID + * CMDQ_OP_ATC_INV + * CMDQ_OP_CFGI_CD + * CMDQ_OP_CFGI_CD_ALL + * + * -EIO will be returned if the command is not supported. + */ +struct iommu_viommu_arm_smmuv3_invalidate { + __aligned_le64 cmd[2]; +}; + +/** + * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) + * @size: sizeof(struct iommu_hwpt_invalidate) + * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation + * @data_uptr: User pointer to an array of driver-specific cache invalidation + * data. + * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data + * type of all the entries in the invalidation request array. It + * should be a type supported by the hwpt pointed by @hwpt_id. + * @entry_len: Length (in bytes) of a request entry in the request array + * @entry_num: Input the number of cache invalidation requests in the array. + * Output the number of requests successfully handled by kernel. + * @__reserved: Must be 0. + * + * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications + * on a user-managed page table should be followed by this operation, if a HWPT + * is passed in via @hwpt_id. Other caches, such as device cache or descriptor + * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field. + * + * Each ioctl can support one or more cache invalidation requests in the array + * that has a total size of @entry_len * @entry_num. + * + * An empty invalidation request array by setting @entry_num==0 is allowed, and + * @entry_len and @data_uptr would be ignored in this case. This can be used to + * check if the given @data_type is supported or not by kernel. + */ +struct iommu_hwpt_invalidate { + __u32 size; + __u32 hwpt_id; + __aligned_u64 data_uptr; + __u32 data_type; + __u32 entry_len; + __u32 entry_num; + __u32 __reserved; +}; +#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) + +/** + * enum iommu_hwpt_pgfault_flags - flags for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_FLAGS_PASID_VALID: The pasid field of the fault data is + * valid. + * @IOMMU_PGFAULT_FLAGS_LAST_PAGE: It's the last fault of a fault group.
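+ * + * To sketch the intended flow (inferred from the fault queue objects below, not a normative description): userspace reads fault records from the fd returned in struct iommu_fault_alloc::out_fault_fd, resolves them, and writes back a response echoing the cookie, e.g. + * + * struct iommu_hwpt_pgfault fault; + * struct iommu_hwpt_page_response resp; + * read(fault_fd, &fault, sizeof(fault)); + * ... populate the page tables for fault.addr ... + * resp.cookie = fault.cookie; + * resp.code = IOMMUFD_PAGE_RESP_SUCCESS; + * write(fault_fd, &resp, sizeof(resp));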
+ */ +enum iommu_hwpt_pgfault_flags { + IOMMU_PGFAULT_FLAGS_PASID_VALID = (1 << 0), + IOMMU_PGFAULT_FLAGS_LAST_PAGE = (1 << 1), +}; + +/** + * enum iommu_hwpt_pgfault_perm - perm bits for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_PERM_READ: request for read permission + * @IOMMU_PGFAULT_PERM_WRITE: request for write permission + * @IOMMU_PGFAULT_PERM_EXEC: (PCIE 10.4.1) request with a PASID that has the + * Execute Requested bit set in PASID TLP Prefix. + * @IOMMU_PGFAULT_PERM_PRIV: (PCIE 10.4.1) request with a PASID that has the + * Privileged Mode Requested bit set in PASID TLP + * Prefix. + */ +enum iommu_hwpt_pgfault_perm { + IOMMU_PGFAULT_PERM_READ = (1 << 0), + IOMMU_PGFAULT_PERM_WRITE = (1 << 1), + IOMMU_PGFAULT_PERM_EXEC = (1 << 2), + IOMMU_PGFAULT_PERM_PRIV = (1 << 3), +}; + +/** + * struct iommu_hwpt_pgfault - iommu page fault data + * @flags: Combination of enum iommu_hwpt_pgfault_flags + * @dev_id: id of the originating device + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: Combination of enum iommu_hwpt_pgfault_perm + * @addr: Fault address + * @length: a hint of how much data the requestor is expecting to fetch. For + * example, if the PRI initiator knows it is going to do a 10MB + * transfer, it could fill in 10MB and the OS could pre-fault in + * 10MB of IOVA. It defaults to 0 if there's no such hint. + * @cookie: kernel-managed cookie identifying a group of fault messages. The + * cookie number encoded in the last page fault of the group should + * be echoed back in the response message. + */ +struct iommu_hwpt_pgfault { + __u32 flags; + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u32 length; + __u32 cookie; +}; + +/** + * enum iommufd_page_response_code - Return status of fault handlers + * @IOMMUFD_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is the + * "Success" defined in PCI 10.4.2.1. + * @IOMMUFD_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is the "Invalid Request" in PCI + * 10.4.2.1. + */ +enum iommufd_page_response_code { + IOMMUFD_PAGE_RESP_SUCCESS = 0, + IOMMUFD_PAGE_RESP_INVALID = 1, +}; + +/** + * struct iommu_hwpt_page_response - IOMMU page fault response + * @cookie: The kernel-managed cookie reported in the fault message. + * @code: One of the response codes in enum iommufd_page_response_code. + */ +struct iommu_hwpt_page_response { + __u32 cookie; + __u32 code; +}; + +/** + * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC) + * @size: sizeof(struct iommu_fault_alloc) + * @flags: Must be 0 + * @out_fault_id: The ID of the new FAULT + * @out_fault_fd: The fd of the new FAULT + * + * Explicitly allocate a fault handling object. + */ +struct iommu_fault_alloc { + __u32 size; + __u32 flags; + __u32 out_fault_id; + __u32 out_fault_fd; +}; +#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC) + +/** + * enum iommu_viommu_type - Virtual IOMMU Type + * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type + */ +enum iommu_viommu_type { + IOMMU_VIOMMU_TYPE_DEFAULT = 0, + IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC) + * @size: sizeof(struct iommu_viommu_alloc) + * @flags: Must be 0 + * @type: Type of the virtual IOMMU.
Must be defined in enum iommu_viommu_type + * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU + * @hwpt_id: ID of a nesting parent HWPT to associate to + * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * + * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's + * virtualization support that is a security-isolated slice of the real IOMMU HW + * that is unique to a specific VM. Operations global to the IOMMU are connected + * to the vIOMMU, such as: + * - Security namespace for guest owned ID, e.g. guest-controlled cache tags + * - Non-device-affiliated event reporting, e.g. invalidation queue errors + * - Access to a sharable nesting parent pagetable across physical IOMMUs + * - Virtualization of various platform IDs, e.g. RIDs and others + * - Delivery of paravirtualized invalidation + * - Direct assigned invalidation queues + * - Direct assigned interrupts + */ +struct iommu_viommu_alloc { + __u32 size; + __u32 flags; + __u32 type; + __u32 dev_id; + __u32 hwpt_id; + __u32 out_viommu_id; +}; +#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) + +/** + * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC) + * @size: sizeof(struct iommu_vdevice_alloc) + * @viommu_id: vIOMMU ID to associate with the virtual device + * @dev_id: The physical device to allocate a virtual instance on the vIOMMU + * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTROY + * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID + * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table + * + * Allocate a virtual device instance (for a physical device) against a vIOMMU. + * This instance holds the device's information (related to its vIOMMU) in a VM. + */ +struct iommu_vdevice_alloc { + __u32 size; + __u32 viommu_id; + __u32 dev_id; + __u32 out_vdevice_id; + __aligned_u64 virt_id; +}; +#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC) #endif diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 0d74ee999aa9e1fbd4e62ee6ef28e759e34b5adc..d3bf7fac004bde381bf0ed20947b5a8ffa9dc8b0 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -14,6 +14,8 @@ #include #include +#include "sysemu/numa.h" + #define KVM_API_VERSION 12 /* *** Deprecated interfaces *** */ @@ -264,6 +266,7 @@ struct kvm_xen_exit { #define KVM_EXIT_RISCV_SBI 35 #define KVM_EXIT_RISCV_CSR 36 #define KVM_EXIT_NOTIFY 37 +#define KVM_EXIT_LOONGARCH_IOCSR 38 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed.
*/ @@ -336,8 +339,16 @@ struct kvm_run { __u32 len; __u8 is_write; } mmio; + /* KVM_EXIT_LOONGARCH_IOCSR */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } iocsr_io; /* KVM_EXIT_HYPERCALL */ struct { +#define KVM_HC_MAP_GPA_RANGE 12 __u64 nr; __u64 args[6]; __u64 ret; @@ -808,6 +819,8 @@ enum { kvm_ioeventfd_flag_nr_deassign, kvm_ioeventfd_flag_nr_virtio_ccw_notify, kvm_ioeventfd_flag_nr_fast_mmio, + kvm_ioeventfd_flag_nr_batch_begin, + kvm_ioeventfd_flag_nr_batch_end, kvm_ioeventfd_flag_nr_max, }; @@ -816,6 +829,10 @@ enum { #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) +#define KVM_IOEVENTFD_FLAG_BATCH_BEGIN \ + (1<< kvm_ioeventfd_flag_nr_batch_begin) +#define KVM_IOEVENTFD_FLAG_BATCH_END \ + (1 << kvm_ioeventfd_flag_nr_batch_end) #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) @@ -1188,6 +1205,25 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COUNTER_OFFSET 227 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 +#define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 + +#define KVM_CAP_ARM_TMM 300 + +#define KVM_CAP_SEV_ES_GHCB 500 +#define KVM_CAP_HYGON_COCO_EXT 501 +/* support userspace to request firmware to build CSV3 guest's memory space */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_SET_PRIV_MEM (1 << 0) +/* support request to update CSV3 guest's memory region multiple times */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_MULT_LUP_DATA (1 << 1) +/* support request to inject secret to CSV3 guest */ +#define KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET (1 << 2) + +#define KVM_CAP_ARM_HW_DIRTY_STATE_TRACK 502 + +#define KVM_CAP_ARM_HISI_IPIV 798 +#define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 + +#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) #ifdef KVM_CAP_IRQ_ROUTING @@ -1358,6 +1394,7 @@ struct kvm_dirty_tlb { #define KVM_REG_ARM64 0x6000000000000000ULL #define KVM_REG_MIPS 0x7000000000000000ULL #define KVM_REG_RISCV 0x8000000000000000ULL +#define KVM_REG_LOONGARCH 0x9000000000000000ULL #define KVM_REG_SIZE_SHIFT 52 #define KVM_REG_SIZE_MASK 0x00f0000000000000ULL @@ -1449,6 +1486,13 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA + KVM_DEV_TYPE_LOONGARCH_IPI, +#define KVM_DEV_TYPE_LOONGARCH_IPI KVM_DEV_TYPE_LOONGARCH_IPI + KVM_DEV_TYPE_LOONGARCH_EIOINTC, +#define KVM_DEV_TYPE_LOONGARCH_EIOINTC KVM_DEV_TYPE_LOONGARCH_EIOINTC + KVM_DEV_TYPE_LOONGARCH_PCHPIC, +#define KVM_DEV_TYPE_LOONGARCH_PCHPIC KVM_DEV_TYPE_LOONGARCH_PCHPIC + KVM_DEV_TYPE_MAX, }; @@ -1457,6 +1501,38 @@ struct kvm_vfio_spapr_tce { __s32 tablefd; }; +#define MAX_NUMA_NODE 8 +#define MAX_CPU_BIT_MAP 4 +#define MAX_NODE_BIT_MAP (MAX_NODES / BITS_PER_LONG) + +struct kvm_numa_node { + __u64 numa_id; + __u64 ipa_start; + __u64 ipa_size; + __u64 host_numa_nodes[MAX_NODE_BIT_MAP]; + __u64 cpu_id[MAX_CPU_BIT_MAP]; +}; + +struct kvm_numa_info { + __u64 numa_cnt; + struct kvm_numa_node numa_nodes[MAX_NUMA_NODE]; +}; + +struct kvm_user_data { + __u64 loader_start; + /* + * When the lowest bit of dtb_info is 0, the value of dtb_info represents the size of the DTB, + * and data_start and data_size represent the address base and size of the MMIO. + * When the lowest bit of dtb_info is 1, data_start and data_size represent the address base + * and size of the DTB. 
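+ * + * For example (an illustrative reading of the rule above, not an authoritative one): to hand over a DTB at 0x44000000 of size 0x10000, set dtb_info = 1 (lowest bit set), data_start = 0x44000000 and data_size = 0x10000; with the lowest bit clear, dtb_info instead carries the DTB size and data_start/data_size describe the MMIO window.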
+ */ + __u64 dtb_info; + __u64 data_start; + __u64 data_size; + __u64 ram_size; + struct kvm_numa_info numa_info; +}; + /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. @@ -1469,7 +1545,7 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) - +#define KVM_LOAD_USER_DATA _IOW(KVMIO, 0x49, struct kvm_user_data) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { __u64 user_addr; @@ -1514,6 +1590,17 @@ struct kvm_s390_ucas_mapping { #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) + +#ifdef __aarch64__ +struct kvm_master_dev_info +{ + __u32 nvectors; /* number of msi vectors */ + struct kvm_msi msi[0]; +}; +#define KVM_CREATE_SHADOW_DEV _IOW(KVMIO, 0xf0, struct kvm_master_dev_info) +#define KVM_DEL_SHADOW_DEV _IOW(KVMIO, 0xf1, __u32) +#endif + /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) @@ -1558,6 +1645,7 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) /* Available with KVM_CAP_COUNTER_OFFSET */ #define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO, 0xb5, struct kvm_arm_counter_offset) +#define KVM_ARM_GET_REG_WRITABLE_MASKS _IOR(KVMIO, 0xb6, struct reg_mask_range) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) @@ -1567,6 +1655,10 @@ struct kvm_s390_ucas_mapping { #define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr) #define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr) +/* ioctls for control vcpu setup during system reset */ +#define KVM_CONTROL_VCPU_PRE_SYSTEM_RESET _IO(KVMIO, 0xe8) +#define KVM_CONTROL_VCPU_POST_SYSTEM_RESET _IO(KVMIO, 0xe9) + /* * ioctls for vcpu fds */ @@ -1914,6 +2006,9 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* Hygon CSV batch command */ + KVM_CSV_COMMAND_BATCH = 0x18, + KVM_SEV_NR_MAX, }; @@ -1992,6 +2087,14 @@ struct kvm_sev_send_update_data { __u32 trans_len; }; +struct kvm_sev_send_update_vmsa { + __u32 vcpu_id; + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + struct kvm_sev_receive_start { __u32 handle; __u32 policy; @@ -2010,6 +2113,98 @@ struct kvm_sev_receive_update_data { __u32 trans_len; }; +struct kvm_sev_receive_update_vmsa { + __u32 vcpu_id; + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv_batch_list_node { + __u64 cmd_data_addr; + __u64 addr; + __u64 next_cmd_addr; +}; + +struct kvm_csv_command_batch { + __u32 command_id; + __u64 csv_batch_list_uaddr; +}; + +struct kvm_csv_init { + __u64 userid_addr; + __u32 len; +}; + +/* CSV3 command */ +enum csv3_cmd_id { + KVM_CSV3_NR_MIN = 0xc0, + + KVM_CSV3_INIT = KVM_CSV3_NR_MIN, + KVM_CSV3_LAUNCH_ENCRYPT_DATA, + KVM_CSV3_LAUNCH_ENCRYPT_VMCB, + KVM_CSV3_SEND_ENCRYPT_DATA, + KVM_CSV3_SEND_ENCRYPT_CONTEXT, + KVM_CSV3_RECEIVE_ENCRYPT_DATA, + KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT, + KVM_CSV3_HANDLE_MEMORY, + + KVM_CSV3_SET_GUEST_PRIVATE_MEMORY = 0xc8, + + KVM_CSV3_NR_MAX, +}; + +struct kvm_csv3_launch_encrypt_data { + __u64 gpa; + __u64 uaddr; + __u32 len; +}; + +struct kvm_csv3_init_data { + __u64 nodemask; +}; + +struct kvm_csv3_send_encrypt_data { + 
__u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_addr_data; + __u32 guest_addr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_send_encrypt_context { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_receive_encrypt_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_addr_data; + __u32 guest_addr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_csv3_receive_encrypt_context { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +#define KVM_CSV3_RELEASE_SHARED_MEMORY (0x0001) + +struct kvm_csv3_handle_memory { + __u64 gpa; + __u32 num_pages; + __u32 opcode; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) @@ -2252,4 +2447,8 @@ struct kvm_s390_zpci_op { /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) +/* get tmi version */ +#define KVM_GET_TMI_VERSION _IOR(KVMIO, 0xd2, uint64_t) +#define MIN_TMI_VERSION_FOR_UEFI_BOOTED_CVM 0x20001 + #endif /* __LINUX_KVM_H */ diff --git a/linux-headers/linux/psp-sev.h b/linux-headers/linux/psp-sev.h index 12ccb70099d417a8870fe38a73dd762a8ab3855c..bcb21339ee39fae6739e70c27e98b2d336dcc253 100644 --- a/linux-headers/linux/psp-sev.h +++ b/linux-headers/linux/psp-sev.h @@ -68,6 +68,7 @@ typedef enum { SEV_RET_INVALID_PARAM, SEV_RET_RESOURCE_LIMIT, SEV_RET_SECURE_DATA_INVALID, + SEV_RET_INVALID_KEY = 0x27, SEV_RET_MAX, } sev_ret_code; diff --git a/linux-headers/linux/stddef.h b/linux-headers/linux/stddef.h index 9bb07083ac89e7ea6d51903e19804cd403435978..bf9749dd1422607dd57e1f64b8a39f0b32ab0ede 100644 --- a/linux-headers/linux/stddef.h +++ b/linux-headers/linux/stddef.h @@ -27,8 +27,13 @@ union { \ struct { MEMBERS } ATTRS; \ struct TAG { MEMBERS } ATTRS NAME; \ - } + } ATTRS +#ifdef __cplusplus +/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. */ +#define __DECLARE_FLEX_ARRAY(T, member) \ + T member[0] +#else /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union * @@ -49,3 +54,5 @@ #ifndef __counted_by #define __counted_by(m) #endif + +#endif /* _LINUX_STDDEF_H */ diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h index 59978fbaae3379174fe21a00ce668b2db5eba302..953c75fedae9eac851fed94cac653c29aa493148 100644 --- a/linux-headers/linux/userfaultfd.h +++ b/linux-headers/linux/userfaultfd.h @@ -40,7 +40,8 @@ UFFD_FEATURE_EXACT_ADDRESS | \ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ UFFD_FEATURE_WP_UNPOPULATED | \ - UFFD_FEATURE_POISON) + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -216,6 +217,11 @@ struct uffdio_api { * (i.e. empty ptes). This will be the default behavior for shmem * & hugetlbfs, so this flag only affects anonymous memory behavior * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. 
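+ * + * A minimal negotiation sketch (illustrative; error handling omitted, uffd is an already created userfaultfd): + * + * struct uffdio_api api = { + * .api = UFFD_API, + * .features = UFFD_FEATURE_WP_ASYNC, + * }; + * ioctl(uffd, UFFDIO_API, &api); + * + * after which write-protect faults on registered ranges are resolved by the kernel itself rather than reported to the fault handler.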
*/ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -232,6 +238,7 @@ struct uffdio_api { #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) #define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) __u64 features; __u64 ioctls; diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index acf72b4999fa75dfcd4c49426c9fdbc135251248..5b1e2871affae86f15b71e3fb84f73ba0015346e 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -25,6 +25,7 @@ #define VFIO_TYPE1_IOMMU 1 #define VFIO_SPAPR_TCE_IOMMU 2 #define VFIO_TYPE1v2_IOMMU 3 +#define VFIO_TYPE1v2_S_IOMMU 12 /* * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping). This * capability is subject to change as groups are added or removed. @@ -56,6 +57,16 @@ */ #define VFIO_UPDATE_VADDR 10 +/* + * The vfio_iommu driver may allow the user to clear the dirty log manually, + * which means the dirty log can be requested not to be cleared automatically + * after it is copied to userspace; it is then the user's duty to clear the + * dirty log. + * + * Note: please refer to VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR and + * VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP. + */ +#define VFIO_DIRTY_LOG_MANUAL_CLEAR 11 + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between @@ -214,6 +225,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ #define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ +#define VFIO_DEVICE_FLAGS_SECURE (1 << 15) /* secure pci device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ @@ -277,8 +289,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ - __u64 size; /* Region size (bytes) */ - __u64 offset; /* Region offset from start of device fd */ + __aligned_u64 size; /* Region size (bytes) */ + __aligned_u64 offset; /* Region offset from start of device fd */ }; #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) @@ -294,8 +306,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 struct vfio_region_sparse_mmap_area { - __u64 offset; /* Offset of mmap'able area within region */ - __u64 size; /* Size of mmap'able area */ + __aligned_u64 offset; /* Offset of mmap'able area within region */ + __aligned_u64 size; /* Size of mmap'able area */ }; struct vfio_region_info_cap_sparse_mmap { @@ -450,9 +462,9 @@ struct vfio_device_migration_info { VFIO_DEVICE_STATE_V1_RESUMING) __u32 reserved; - __u64 pending_bytes; - __u64 data_offset; - __u64 data_size; + __aligned_u64 pending_bytes; + __aligned_u64 data_offset; + __aligned_u64 data_size; }; /* @@ -476,7 +488,7 @@ struct vfio_device_migration_info { struct vfio_region_info_cap_nvlink2_ssatgt { struct vfio_info_cap_header header; - __u64 tgt; + __aligned_u64 tgt; }; /* @@ -816,7 +828,7 @@ struct vfio_device_gfx_plane_info { __u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */ /* out */ __u32 drm_format; /* drm format of plane */ - __u64 drm_format_mod; /* tiled mode */ + __aligned_u64 drm_format_mod; /* tiled mode */ __u32 width; /* width of plane */ __u32 height; /* height of plane */ __u32 stride;
/* stride of plane */ @@ -829,6 +841,7 @@ struct vfio_device_gfx_plane_info { __u32 region_index; /* region index */ __u32 dmabuf_id; /* dma-buf id */ }; + __u32 reserved; }; #define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14) @@ -863,9 +876,10 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ #define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ #define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) - __u64 offset; /* device fd offset of write */ - __u64 data; /* data to be written */ + __aligned_u64 offset; /* device fd offset of write */ + __aligned_u64 data; /* data to be written */ __s32 fd; /* -1 for de-assignment */ + __u32 reserved; }; #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) @@ -1434,6 +1448,27 @@ struct vfio_device_feature_mig_data_size { #define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 +/** + * Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device + * based on the operation specified in op flag. + * + * The functionality is incorporated for devices that need bus master control, + * but the in-band device interface lacks the support. Consequently, it is not + * applicable to PCI devices, as bus master control for PCI devices is managed + * in-band through the configuration space. At present, this feature is supported + * only for CDX devices. + * When the device's BUS MASTER setting is configured as CLEAR, it will result in + * blocking all incoming DMA requests from the device. On the other hand, configuring + * the device's BUS MASTER setting as SET (enable) will grant the device the + * capability to perform DMA to the host memory. + */ +struct vfio_device_feature_bus_master { + __u32 op; +#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */ +#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */ +}; +#define VFIO_DEVICE_FEATURE_BUS_MASTER 10 + /* -------- API for Type1 VFIO IOMMU -------- */ /** @@ -1449,7 +1484,7 @@ struct vfio_iommu_type1_info { __u32 flags; #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ - __u64 iova_pgsizes; /* Bitmap of supported page sizes */ + __aligned_u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ __u32 pad; }; @@ -1628,8 +1663,30 @@ struct vfio_iommu_type1_dma_unmap { * actual bitmap. If dirty pages logging is not enabled, an error will be * returned. * - * Only one of the flags _START, _STOP and _GET may be specified at a time. + * The VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR flag is almost the same + * as VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, except that the underlying dirty + * bitmap is not cleared automatically. The user can clear it manually by + * calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set. * + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set, + * instructs the IOMMU driver to clear the dirty status of pages in a bitmap + * for IOMMU container for a given IOVA range. The user must specify the IOVA + * range, the bitmap and the pgsize through the structure + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface + * supports clearing a bitmap of the smallest supported pgsize only and can be + * modified in future to clear a bitmap of any specified supported pgsize. The + * user must provide a memory area for the bitmap memory and specify its size + * in bitmap.size.
One bit is used to represent one page consecutively starting + * from iova offset. The user should provide page size in bitmap.pgsize field. + * A bit set in the bitmap indicates that the page at that offset from iova is + * cleared the dirty status, and dirty tracking is re-enabled for that page. The + * caller must set argsz to a value including the size of structure + * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If + * dirty pages logging is not enabled, an error will be returned. Note: user + * should clear dirty log before handle corresponding dirty pages. + * + * Only one of the flags _START, _STOP, _GET, _GET_NOCLEAR_, and _CLEAR may be + * specified at a time. */ struct vfio_iommu_type1_dirty_bitmap { __u32 argsz; @@ -1637,6 +1694,8 @@ struct vfio_iommu_type1_dirty_bitmap { #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1) #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR (1 << 3) +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP (1 << 4) __u8 data[]; }; diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index f5c48b61ab62244104bbf1b2100d3db7286f8c82..f5c05abe8b0a5bba7857318437c375a12014bda3 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -43,6 +43,10 @@ * The bit is set using an atomic 32 bit operation. */ /* Set base address for logging. */ #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +/* Set buffer size for logging */ +#define VHOST_SET_LOG_SIZE _IOW(VHOST_VIRTIO, 0x05, __u64) +/* Logging sync */ +#define VHOST_LOG_SYNC _IO(VHOST_VIRTIO, 0x06) /* Specify an eventfd file descriptor to signal on log write. */ #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) /* By default, a device gets one vhost_worker that its virtqueues share. This @@ -219,4 +223,49 @@ */ #define VHOST_VDPA_RESUME _IO(VHOST_VIRTIO, 0x7E) +/* Get the group for the descriptor table including driver & device areas + * of a virtqueue: read index, write group in num. + * The virtqueue index is stored in the index field of vhost_vring_state. + * The group ID of the descriptor table for this specific virtqueue + * is returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ + struct vhost_vring_state) + +/* Bind a vDPA device to the specified iommufd + * + * After the return of this ioctl, the vDPA device is binded to the specified + * iommufd, and the device id is also returned. + */ +#define VHOST_VDPA_BIND_IOMMUFD _IO(VHOST_VIRTIO, 0x90) + +/* Unbind a vDPA device from the specified iommufd + * + * After the return of this ioctl, the vDPA device is unbinded from the specified + * iommufd. + */ +#define VHOST_VDPA_UNBIND_IOMMUFD _IO(VHOST_VIRTIO, 0x91) + +/* Associate the vDPA device with an address space within the bound iommufd + * + * After the return of this ioctl, the vDPA device is attached to the bound + * iommufd. + */ +#define VHOST_VDPA_ATTACH_IOMMUFD_PT _IO(VHOST_VIRTIO, 0x92) + +/* Detach the vDPA device from an address space within the bound iommufd. + * + * After the return of this ioctl, the vDPA device is detached from the address + * space within the bound iommufd. 
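Before moving on to vhost.h, it is worth spelling out how the new manual-clear flags are meant to be driven from userspace. The following is a minimal sketch of one dirty-log sync iteration; the container fd, IOVA range, page size and bitmap buffer are assumed to come from an already-configured type1 container with dirty tracking started, and error handling is trimmed:

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /*
     * Fetch the dirty bitmap without clearing it, then clear it explicitly
     * before handling the pages, as the header comment above requires.
     */
    static int sync_dirty_log(int container_fd, __u64 iova, __u64 size,
                              __u64 pgsize, __u64 *bitmap)
    {
        size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
                       sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
        struct vfio_iommu_type1_dirty_bitmap *db = calloc(1, argsz);
        struct vfio_iommu_type1_dirty_bitmap_get *range;
        int ret;

        if (!db) {
            return -1;
        }
        range = (void *)db->data;
        db->argsz = argsz;          /* excludes the bitmap itself */
        db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP_NOCLEAR;
        range->iova = iova;
        range->size = size;
        range->bitmap.pgsize = pgsize;
        range->bitmap.size = (size / pgsize + 7) / 8;   /* one bit per page */
        range->bitmap.data = bitmap;

        ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);
        if (!ret) {
            db->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP;
            ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, db);
        }
        free(db);
        return ret;     /* caller then copies the pages whose bits are set */
    }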
diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h
index f5c48b61ab62244104bbf1b2100d3db7286f8c82..f5c05abe8b0a5bba7857318437c375a12014bda3 100644
--- a/linux-headers/linux/vhost.h
+++ b/linux-headers/linux/vhost.h
@@ -43,6 +43,10 @@
  * The bit is set using an atomic 32 bit operation. */
 /* Set base address for logging. */
 #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
+/* Set buffer size for logging */
+#define VHOST_SET_LOG_SIZE _IOW(VHOST_VIRTIO, 0x05, __u64)
+/* Logging sync */
+#define VHOST_LOG_SYNC _IO(VHOST_VIRTIO, 0x06)
 /* Specify an eventfd file descriptor to signal on log write. */
 #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
 /* By default, a device gets one vhost_worker that its virtqueues share. This
@@ -219,4 +223,49 @@
  */
 #define VHOST_VDPA_RESUME		_IO(VHOST_VIRTIO, 0x7E)

+/* Get the group for the descriptor table including driver & device areas
+ * of a virtqueue: read index, write group in num.
+ * The virtqueue index is stored in the index field of vhost_vring_state.
+ * The group ID of the descriptor table for this specific virtqueue
+ * is returned via num field of vhost_vring_state.
+ */
+#define VHOST_VDPA_GET_VRING_DESC_GROUP	_IOWR(VHOST_VIRTIO, 0x7F,	\
+					      struct vhost_vring_state)
+
+/* Bind a vDPA device to the specified iommufd
+ *
+ * After the return of this ioctl, the vDPA device is bound to the specified
+ * iommufd, and the device id is also returned.
+ */
+#define VHOST_VDPA_BIND_IOMMUFD	_IO(VHOST_VIRTIO, 0x90)
+
+/* Unbind a vDPA device from the specified iommufd
+ *
+ * After the return of this ioctl, the vDPA device is unbound from the
+ * specified iommufd.
+ */
+#define VHOST_VDPA_UNBIND_IOMMUFD	_IO(VHOST_VIRTIO, 0x91)
+
+/* Associate the vDPA device with an address space within the bound iommufd
+ *
+ * After the return of this ioctl, the vDPA device is attached to the bound
+ * iommufd.
+ */
+#define VHOST_VDPA_ATTACH_IOMMUFD_PT	_IO(VHOST_VIRTIO, 0x92)
+
+/* Detach the vDPA device from an address space within the bound iommufd.
+ *
+ * After the return of this ioctl, the vDPA device is detached from the address
+ * space within the bound iommufd.
+ */
+#define VHOST_VDPA_DETACH_IOMMUFD_PT	_IO(VHOST_VIRTIO, 0x93)
+
+/* set and get device buffer */
+#define VHOST_GET_DEV_BUFFER	_IOR(VHOST_VIRTIO, 0xb0, struct vhost_vdpa_config)
+#define VHOST_SET_DEV_BUFFER	_IOW(VHOST_VIRTIO, 0xb1, struct vhost_vdpa_config)
+#define VHOST_GET_DEV_BUFFER_SIZE	_IOR(VHOST_VIRTIO, 0xb3, __u32)
+
+/* set device migration state */
+#define VHOST_VDPA_SET_MIG_STATE	_IOW(VHOST_VIRTIO, 0xb2, __u8)
+
 #endif
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index cf9e74468b11c7dd35dc6e6c3c57ef9d6336d01d..87eb318b46eb850646892e998bb116700a75459c 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2980,7 +2980,7 @@ static uintptr_t pgb_try_itree(const PGBAddrs *ga, uintptr_t base,
 static uintptr_t pgb_find_itree(const PGBAddrs *ga, IntervalTreeRoot *root,
                                 uintptr_t align, uintptr_t brk)
 {
-    uintptr_t last = mmap_min_addr;
+    uintptr_t last = sizeof(uintptr_t) == 4 ? MiB : GiB;
     uintptr_t base, skip;

     while (true) {
@@ -3263,7 +3263,8 @@ static void load_elf_image(const char *image_name, const ImageSource *src,
                            char **pinterp_name)
 {
     g_autofree struct elf_phdr *phdr = NULL;
-    abi_ulong load_addr, load_bias, loaddr, hiaddr, error;
+    abi_ulong load_addr, load_bias, loaddr, hiaddr, error, align;
+    size_t reserve_size, align_size;
     int i, prot_exec;
     Error *err = NULL;

@@ -3347,6 +3348,9 @@ static void load_elf_image(const char *image_name, const ImageSource *src,

     load_addr = loaddr;

+    align = pow2ceil(info->alignment);
+    info->alignment = align;
+
     if (pinterp_name != NULL) {
         if (ehdr->e_type == ET_EXEC) {
             /*
@@ -3355,8 +3359,6 @@ static void load_elf_image(const char *image_name, const ImageSource *src,
              */
             probe_guest_base(image_name, loaddr, hiaddr);
         } else {
-            abi_ulong align;
-
             /*
              * The binary is dynamic, but we still need to
              * select guest_base. In this case we pass a size.
@@ -3374,10 +3376,7 @@ static void load_elf_image(const char *image_name, const ImageSource *src,
      * Since we do not have complete control over the guest
      * address space, we prefer the kernel to choose some address
      * rather than force the use of LOAD_ADDR via MAP_FIXED.
-     * But without MAP_FIXED we cannot guarantee alignment,
-     * only suggest it.
      */
-    align = pow2ceil(info->alignment);
     if (align) {
         load_addr &= -align;
     }
@@ -3401,13 +3400,35 @@ static void load_elf_image(const char *image_name, const ImageSource *src,
      * In both cases, we will overwrite pages in this range with mappings
      * from the executable.
      */
-    load_addr = target_mmap(load_addr, (size_t)hiaddr - loaddr + 1, PROT_NONE,
+    reserve_size = (size_t)hiaddr - loaddr + 1;
+    align_size = reserve_size;
+
+    if (ehdr->e_type != ET_EXEC && align > qemu_real_host_page_size()) {
+        align_size += align - 1;
+    }
+
+    load_addr = target_mmap(load_addr, align_size, PROT_NONE,
                             MAP_PRIVATE | MAP_ANON | MAP_NORESERVE |
                             (ehdr->e_type == ET_EXEC ?
MAP_FIXED_NOREPLACE : 0), -1, 0); if (load_addr == -1) { goto exit_mmap; } + + if (align_size != reserve_size) { + abi_ulong align_addr = ROUND_UP(load_addr, align); + abi_ulong align_end = align_addr + reserve_size; + abi_ulong load_end = load_addr + align_size; + + if (align_addr != load_addr) { + target_munmap(load_addr, align_addr - load_addr); + } + if (align_end != load_end) { + target_munmap(align_end, load_end - align_end); + } + load_addr = align_addr; + } + load_bias = load_addr - loaddr; if (elf_is_fdpic(ehdr)) { diff --git a/linux-user/syscall.c b/linux-user/syscall.c index e384e14248901c686a3aa747c5f0f2620382b8b6..513996e6fa26a226c7c907d381d42ea2ee909654 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -53,7 +53,6 @@ #include #include #include -//#include #include #include #include diff --git a/meson.build b/meson.build index 6c77d9687ded405fcd6cd8b231929663644c890a..d221f5cad53ec5864b35a8b4132d46df4d3c8b57 100644 --- a/meson.build +++ b/meson.build @@ -114,6 +114,8 @@ elif cpu in ['riscv32'] kvm_targets = ['riscv32-softmmu'] elif cpu in ['riscv64'] kvm_targets = ['riscv64-softmmu'] +elif cpu in ['loongarch64'] + kvm_targets = ['loongarch64-softmmu'] else kvm_targets = [] endif @@ -1041,6 +1043,32 @@ if not get_option('zstd').auto() or have_block required: get_option('zstd'), method: 'pkg-config') endif +qpl = not_found +if not get_option('qpl').auto() or have_system + qpl = dependency('qpl', version: '>=1.5.0', + required: get_option('qpl'), + method: 'pkg-config') +endif +uadk = not_found +if not get_option('uadk').auto() or have_system + libwd = dependency('libwd', version: '>=2.6', + required: get_option('uadk'), + method: 'pkg-config') + libwd_comp = dependency('libwd_comp', version: '>=2.6', + required: get_option('uadk'), + method: 'pkg-config') + if libwd.found() and libwd_comp.found() + uadk = declare_dependency(dependencies: [libwd, libwd_comp]) + endif +endif + +qatzip = not_found +if not get_option('qatzip').auto() or have_system + qatzip = dependency('qatzip', version: '>=1.1.2', + required: get_option('qatzip'), + method: 'pkg-config') +endif + virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -1137,7 +1165,7 @@ iconv = not_found curses = not_found if have_system and get_option('curses').allowed() curses_test = ''' - #if defined(__APPLE__) || defined(__OpenBSD__) + #ifdef __APPLE__ #define _XOPEN_SOURCE_EXTENDED 1 #endif #include @@ -1483,6 +1511,8 @@ endif gcrypt = not_found nettle = not_found hogweed = not_found +crypto_sm4 = not_found +crypto_sm3 = not_found xts = 'none' if get_option('nettle').enabled() and get_option('gcrypt').enabled() @@ -1508,6 +1538,28 @@ if not gnutls_crypto.found() cc.find_library('gpg-error', required: true)], version: gcrypt.version()) endif + crypto_sm4 = gcrypt + # SM4 ALG is available in libgcrypt >= 1.9 + if gcrypt.found() and not cc.links(''' + #include + int main(void) { + gcry_cipher_hd_t handler; + gcry_cipher_open(&handler, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_ECB, 0); + return 0; + }''', dependencies: gcrypt) + crypto_sm4 = not_found + endif + crypto_sm3 = gcrypt + # SM3 ALG is available in libgcrypt >= 1.8 + if gcrypt.found() and not cc.links(''' + #include + int main(void) { + gcry_md_hd_t handler; + gcry_md_open(&handler, GCRY_MD_SM3, 0); + return 0; + }''', dependencies: gcrypt) + crypto_sm3 = not_found + endif endif if (not get_option('nettle').auto() or have_system) and not gcrypt.found() nettle = dependency('nettle', version: '>=3.4', @@ -1516,6 +1568,43 @@ 
if not gnutls_crypto.found() if nettle.found() and not cc.has_header('nettle/xts.h', dependencies: nettle) xts = 'private' endif + crypto_sm4 = nettle + # SM4 ALG is available in nettle >= 3.9 + if nettle.found() and not cc.links(''' + #include + int main(void) { + struct sm4_ctx ctx; + unsigned char key[16] = {0}; + sm4_set_encrypt_key(&ctx, key); + return 0; + }''', dependencies: nettle) + crypto_sm4 = not_found + endif + crypto_sm3 = nettle + # SM3 ALG is available in nettle >= 3.4 + if nettle.found() and not cc.links(''' + #include + #include + int main(void) { + struct sm3_ctx ctx; + struct hmac_sm3_ctx hmac_ctx; + unsigned char data[64] = {0}; + unsigned char output[32]; + + // SM3 hash function test + sm3_init(&ctx); + sm3_update(&ctx, 64, data); + sm3_digest(&ctx, 32, data); + + // HMAC-SM3 test + hmac_sm3_set_key(&hmac_ctx, 32, data); + hmac_sm3_update(&hmac_ctx, 64, data); + hmac_sm3_digest(&hmac_ctx, 32, output); + + return 0; + }''', dependencies: nettle) + crypto_sm3 = not_found + endif endif endif @@ -1909,8 +1998,14 @@ endif # libxdp libxdp = not_found if not get_option('af_xdp').auto() or have_system - libxdp = dependency('libxdp', required: get_option('af_xdp'), - version: '>=1.4.0', method: 'pkg-config') + if libbpf.found() + libxdp = dependency('libxdp', required: get_option('af_xdp'), + version: '>=1.4.0', method: 'pkg-config') + else + if get_option('af_xdp').enabled() + error('libxdp requested, but libbpf is not available') + endif + endif endif # libdw @@ -2118,6 +2213,8 @@ config_host_data.set('CONFIG_BLKIO', blkio.found()) if blkio.found() config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD', blkio.version().version_compare('>=1.3.0')) + config_host_data.set('CONFIG_BLKIO_WRITE_ZEROS_FUA', + blkio.version().version_compare('>=1.4.0')) endif config_host_data.set('CONFIG_CURL', curl.found()) config_host_data.set('CONFIG_CURSES', curses.found()) @@ -2202,12 +2299,17 @@ config_host_data.set('CONFIG_GNUTLS_CRYPTO', gnutls_crypto.found()) config_host_data.set('CONFIG_TASN1', tasn1.found()) config_host_data.set('CONFIG_GCRYPT', gcrypt.found()) config_host_data.set('CONFIG_NETTLE', nettle.found()) +config_host_data.set('CONFIG_CRYPTO_SM4', crypto_sm4.found()) +config_host_data.set('CONFIG_CRYPTO_SM3', crypto_sm3.found()) config_host_data.set('CONFIG_HOGWEED', hogweed.found()) config_host_data.set('CONFIG_QEMU_PRIVATE_XTS', xts == 'private') config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim) config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) +config_host_data.set('CONFIG_QPL', qpl.found()) +config_host_data.set('CONFIG_UADK', uadk.found()) +config_host_data.set('CONFIG_QATZIP', qatzip.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -3358,6 +3460,7 @@ if have_system or have_user 'target/hppa', 'target/i386', 'target/i386/kvm', + 'target/loongarch', 'target/mips/tcg', 'target/nios2', 'target/ppc', @@ -4277,6 +4380,8 @@ summary_info += {'nettle': nettle} if nettle.found() summary_info += {' XTS': xts != 'private'} endif +summary_info += {'SM4 ALG support': crypto_sm4} +summary_info += {'SM3 ALG support': crypto_sm3} summary_info += {'AF_ALG support': have_afalg} summary_info += {'rng-none': get_option('rng_none')} summary_info += {'Linux keyring': have_keyring} @@ -4379,6 +4484,9 @@ summary_info += {'snappy support': 
snappy} summary_info += {'bzip2 support': libbzip2} summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} +summary_info += {'Query Processing Library support': qpl} +summary_info += {'UADK Library support': uadk} +summary_info += {'qatzip support': qatzip} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index c9baeda6395634c3478a3c2a3a8c8f57fbe0b592..61996300d57ee34c0a9843d1100b6309bf5bd260 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -121,7 +121,7 @@ option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') option('avx512bw', type: 'feature', value: 'auto', description: 'AVX512BW optimizations') -option('keyring', type: 'feature', value: 'auto', +option('keyring', type: 'feature', value: 'disabled', description: 'Linux keyring support') option('libkeyutils', type: 'feature', value: 'auto', description: 'Linux keyutils support') @@ -259,6 +259,12 @@ option('xkbcommon', type : 'feature', value : 'auto', description: 'xkbcommon support') option('zstd', type : 'feature', value : 'auto', description: 'zstd compression support') +option('qpl', type : 'feature', value : 'auto', + description: 'Query Processing Library support') +option('uadk', type : 'feature', value : 'auto', + description: 'UADK Library support') +option('qatzip', type: 'feature', value: 'auto', + description: 'QATzip compression support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/migration/block.c b/migration/block.c index a15f9bddcb9d8abaa8e9a31dd9f72ff732395b59..710ef6f4909b905a82cd0926c39659f0a6ee14ef 100644 --- a/migration/block.c +++ b/migration/block.c @@ -409,7 +409,10 @@ static int init_blk_migration(QEMUFile *f) } sectors = bdrv_nb_sectors(bs); - if (sectors <= 0) { + if (sectors == 0) { + continue; + } + if (sectors < 0) { ret = sectors; bdrv_next_cleanup(&it); goto out; diff --git a/migration/channel.c b/migration/channel.c index ca3319a30985c420020281391206ef418a014e3c..f9de064f3b134dfe9a2128e9189d5565dd7f81e7 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -117,9 +117,12 @@ int migration_channel_read_peek(QIOChannel *ioc, len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); - if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { - error_setg(errp, - "Failed to peek at channel"); + if (len < 0 && len != QIO_CHANNEL_ERR_BLOCK) { + return -1; + } + + if (len == 0) { + error_setg(errp, "Failed to peek at channel"); return -1; } diff --git a/migration/colo.c b/migration/colo.c index 4447e349149a19bf1ed9969e923b1ea8b2296989..8f301b7e57164d484b0ffb28778ba82269611e60 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -830,6 +830,16 @@ static void *colo_process_incoming_thread(void *opaque) return NULL; } + /* Make sure all file formats throw away their mutable metadata */ + qemu_mutex_lock_iothread(); + bdrv_activate_all(&local_err); + if (local_err) { + qemu_mutex_unlock_iothread(); + error_report_err(local_err); + return NULL; + } + qemu_mutex_unlock_iothread(); + failover_init_state(); mis->to_src_file = qemu_file_get_return_path(mis->from_src_file); @@ -917,7 +927,6 @@ out: int coroutine_fn colo_incoming_co(void) { MigrationIncomingState *mis = migration_incoming_get_current(); - Error *local_err = NULL; QemuThread th; 
assert(qemu_mutex_iothread_locked()); @@ -926,13 +935,6 @@ int coroutine_fn colo_incoming_co(void) return 0; } - /* Make sure all file formats throw away their mutable metadata */ - bdrv_activate_all(&local_err); - if (local_err) { - error_report_err(local_err); - return -EINVAL; - } - qemu_thread_create(&th, "COLO incoming", colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE); diff --git a/migration/file.c b/migration/file.c index 5d4975f43e198b6dfdc86a6c96ddcd1b2f09e3c1..fb3f743e54e6699ce037bc501f9cd136a0fc43be 100644 --- a/migration/file.c +++ b/migration/file.c @@ -46,12 +46,19 @@ void file_start_outgoing_migration(MigrationState *s, trace_migration_file_outgoing(filename); - fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC, - 0600, errp); + fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY, 0600, errp); if (!fioc) { return; } + if (ftruncate(fioc->fd, offset)) { + error_setg_errno(errp, errno, + "failed to truncate migration file to offset %" PRIx64, + offset); + object_unref(OBJECT(fioc)); + return; + } + ioc = QIO_CHANNEL(fioc); if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { return; diff --git a/migration/meson.build b/migration/meson.build index 92b1cc429783cf040fec324ebb111105d1b68284..aba2581705cb53de18765bbe854926a58a2c21cc 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,7 +22,7 @@ system_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', - 'ram-compress.c', + 'multifd-zero-page.c', 'options.c', 'postcopy-ram.c', 'savevm.c', @@ -40,7 +40,11 @@ if get_option('live_block_migration').allowed() system_ss.add(files('block.c')) endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) +system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) +system_ss.add(when: uadk, if_true: files('multifd-uadk.c')) +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', - 'target.c')) + 'target.c', + 'ram-compress.c')) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 86ae832176a03512e4d5a4c9896facebc55edf4d..d6d5f373a198505cf6a36ac679ab80af8f3dc10b 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -30,6 +30,7 @@ #include "sysemu/runstate.h" #include "ui/qemu-spice.h" #include "sysemu/sysemu.h" +#include "sysemu/kvm.h" #include "options.h" #include "migration.h" @@ -291,6 +292,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) MigrationParameter_str(MIGRATION_PARAMETER_DECOMPRESS_THREADS), params->decompress_threads); assert(params->has_throttle_trigger_threshold); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_COMPRESS_METHOD), + CompressMethod_str(params->compress_method)); monitor_printf(mon, "%s: %u\n", MigrationParameter_str(MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD), params->throttle_trigger_threshold); @@ -344,6 +348,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION), MultiFDCompression_str(params->multifd_compression)); + assert(params->has_zero_page_detection); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION), + qapi_enum_lookup(&ZeroPageDetection_lookup, + params->zero_page_detection)); monitor_printf(mon, "%s: %" PRIu64 " bytes\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), params->xbzrle_cache_size); 
@@ -392,6 +401,24 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MODE), qapi_enum_lookup(&MigMode_lookup, params->mode)); + + assert(params->sev_pdh); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_PDH), + params->sev_pdh); + assert(params->sev_plat_cert); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_PLAT_CERT), + params->sev_plat_cert); + assert(params->sev_amd_cert); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_SEV_AMD_CERT), + params->sev_amd_cert); + + assert(params->has_hdbss_buffer_size); + monitor_printf(mon, "%s: %u\n", + MigrationParameter_str(MIGRATION_PARAMETER_HDBSS_BUFFER_SIZE), + params->hdbss_buffer_size); } qapi_free_MigrationParameters(params); @@ -403,6 +430,11 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict) const char *name = qdict_get_str(qdict, "name"); Error *err = NULL; + if (virtcca_cvm_enabled()) { + error_setg(&err, "The loadvm command is temporarily unsupported in cvm."); + return; + } + vm_stop(RUN_STATE_RESTORE_VM); if (load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) { @@ -519,6 +551,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) MigrateSetParameters *p = g_new0(MigrateSetParameters, 1); uint64_t valuebw = 0; uint64_t cache_size; + CompressMethod compress_method; Error *err = NULL; int val, ret; @@ -544,6 +577,14 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_decompress_threads = true; visit_type_uint8(v, param, &p->decompress_threads, &err); break; + case MIGRATION_PARAMETER_COMPRESS_METHOD: + p->has_compress_method = true; + visit_type_CompressMethod(v, param, &compress_method, &err); + if (err) { + break; + } + p->compress_method = compress_method; + break; case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD: p->has_throttle_trigger_threshold = true; visit_type_uint8(v, param, &p->throttle_trigger_threshold, &err); @@ -628,10 +669,18 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zlib_level = true; visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; + case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL: + p->has_multifd_qatzip_level = true; + visit_type_uint8(v, param, &p->multifd_qatzip_level, &err); + break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; + case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION: + p->has_zero_page_detection = true; + visit_type_ZeroPageDetection(v, param, &p->zero_page_detection, &err); + break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; if (!visit_type_size(v, param, &cache_size, &err)) { @@ -679,6 +728,25 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_mode = true; visit_type_MigMode(v, param, &p->mode, &err); break; + case MIGRATION_PARAMETER_SEV_PDH: + p->sev_pdh = g_new0(StrOrNull, 1); + p->sev_pdh->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_pdh->u.s, &err); + break; + case MIGRATION_PARAMETER_SEV_PLAT_CERT: + p->sev_plat_cert = g_new0(StrOrNull, 1); + p->sev_plat_cert->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_plat_cert->u.s, &err); + break; + case MIGRATION_PARAMETER_SEV_AMD_CERT: + p->sev_amd_cert = g_new0(StrOrNull, 1); + p->sev_amd_cert->type = QTYPE_QSTRING; + visit_type_str(v, param, &p->sev_amd_cert->u.s, &err); + break; 
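+        /*
+         * Illustrative HMP usage for two of the parameters handled above
+         * (names follow the usual QAPI dash convention; the values are
+         * examples, not defaults):
+         *
+         *   (qemu) migrate_set_parameter zero-page-detection multifd
+         *   (qemu) migrate_set_parameter multifd-qatzip-level 5
+         */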
+ case MIGRATION_PARAMETER_HDBSS_BUFFER_SIZE: + p->has_hdbss_buffer_size = true; + visit_type_uint8(v, param, &p->hdbss_buffer_size, &err); + break; default: assert(0); } diff --git a/migration/migration.c b/migration/migration.c index 3ce04b2aafe2cf40ca47af3743938a54626d2383..526f926b797279958db61c1b163e60f76577e384 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -68,6 +68,8 @@ #include "sysemu/dirtylimit.h" #include "qemu/sockets.h" +#define DEFAULT_FD_MAX 4096 + static NotifierList migration_state_notifiers = NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); @@ -99,7 +101,6 @@ static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, int *current_active_state, int new_state); -static void migrate_fd_cancel(MigrationState *s); static bool close_return_path_on_source(MigrationState *s); static void migration_downtime_start(MigrationState *s) @@ -128,11 +129,17 @@ static bool migration_needs_multiple_sockets(void) return migrate_multifd() || migrate_postcopy_preempt(); } -static bool transport_supports_multi_channels(SocketAddress *saddr) +static bool transport_supports_multi_channels(MigrationAddress *addr) { - return saddr->type == SOCKET_ADDRESS_TYPE_INET || - saddr->type == SOCKET_ADDRESS_TYPE_UNIX || - saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + SocketAddress *saddr = &addr->u.socket; + + return saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + } + + return false; } static bool @@ -140,8 +147,7 @@ migration_channels_and_transport_compatible(MigrationAddress *addr, Error **errp) { if (migration_needs_multiple_sockets() && - (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) && - !transport_supports_multi_channels(&addr->u.socket)) { + !transport_supports_multi_channels(addr)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)"); return false; } @@ -270,8 +276,13 @@ void migration_incoming_state_destroy(void) { struct MigrationIncomingState *mis = migration_incoming_get_current(); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); + /* + * RAM state cleanup needs to happen after multifd cleanup, because + * multifd threads can use some of its states (receivedmap). 
+     */
+    qemu_loadvm_state_cleanup();

     if (mis->to_src_file) {
         /* Tell source that we are done */
@@ -623,7 +634,7 @@ static void process_incoming_migration_bh(void *opaque)

     trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced");

-    multifd_load_shutdown();
+    multifd_recv_shutdown();

     dirty_bitmap_mig_before_vm_start();

@@ -699,6 +710,13 @@ process_incoming_migration_co(void *opaque)
     }

     if (ret < 0) {
+        MigrationState *s = migrate_get_current();
+
+        if (migrate_has_error(s)) {
+            WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
+                error_report_err(s->error);
+            }
+        }
         error_report("load of migration failed: %s", strerror(-ret));
         goto fail;
     }
@@ -715,7 +733,7 @@ fail:
                       MIGRATION_STATUS_FAILED);
     qemu_fclose(mis->from_src_file);

-    multifd_load_cleanup();
+    multifd_recv_cleanup();
     compress_threads_load_cleanup();

     exit(EXIT_FAILURE);
@@ -848,8 +866,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
         default_channel = !mis->from_src_file;
     }

-    if (multifd_load_setup(errp) != 0) {
-        error_setg(errp, "Failed to setup multifd channels");
+    if (multifd_recv_setup(errp) != 0) {
         return;
     }

@@ -1301,7 +1318,7 @@ static void migrate_fd_cleanup(MigrationState *s)
     }
     qemu_mutex_lock_iothread();

-    multifd_save_cleanup();
+    multifd_send_shutdown();
     qemu_mutex_lock(&s->qemu_file_lock);
     tmp = s->to_dst_file;
     s->to_dst_file = NULL;
@@ -1377,7 +1394,7 @@ static void migrate_error_free(MigrationState *s)
     }
 }

-static void migrate_fd_error(MigrationState *s, const Error *error)
+void migrate_fd_error(MigrationState *s, const Error *error)
 {
     trace_migrate_fd_error(error_get_pretty(error));
     assert(s->to_dst_file == NULL);
@@ -1386,7 +1403,7 @@ static void migrate_fd_error(MigrationState *s, const Error *error)
     migrate_set_error(s, error);
 }

-static void migrate_fd_cancel(MigrationState *s)
+void migrate_fd_cancel(MigrationState *s)
 {
     int old_state ;

@@ -1713,6 +1730,31 @@ void migrate_del_blocker(Error **reasonp)
     }
 }

+/*
+ * The kernel expands the fd table allocated to the QEMU process whenever the
+ * number of fds held by the process exceeds a power of 2 (starting from 64).
+ * Each expansion introduces tens of ms of latency due to RCU synchronization.
+ * The expansion is therefore completed during QEMU process initialization, to
+ * avoid triggering it during the migration downtime phase.
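+ * (Illustratively: with DEFAULT_FD_MAX at 4096, the dup() loop below walks
+ * the fd table through its 64, 128, ..., 4096-entry growth steps once at
+ * startup, so no expansion remains to be triggered during downtime.)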
+ */ +static void qemu_pre_extend_fdtable(void) +{ + int buffer[DEFAULT_FD_MAX] = {0}; + int i; + + /* expand fdtable */ + for (i = 0; i < DEFAULT_FD_MAX; i++) { + buffer[i] = qemu_dup(STDIN_FILENO); + } + + /* close tmp fd */ + for (i = 0; i < DEFAULT_FD_MAX; i++) { + if (buffer[i] > 0) { + (void)qemu_close(buffer[i]); + } + } +} + void qmp_migrate_incoming(const char *uri, bool has_channels, MigrationChannelList *channels, Error **errp) { @@ -1732,6 +1774,8 @@ void qmp_migrate_incoming(const char *uri, bool has_channels, return; } + qemu_pre_extend_fdtable(); + qemu_start_incoming_migration(uri, has_channels, channels, &local_err); if (local_err) { @@ -3299,6 +3343,9 @@ static void *migration_thread(void *opaque) MigThrError thr_error; bool urgent = false; + /* report migration thread pid to libvirt */ + qapi_event_send_migration_pid(qemu_get_thread_id()); + thread = migration_threads_add("live_migration", qemu_get_thread_id()); rcu_register_thread(); @@ -3306,6 +3353,10 @@ static void *migration_thread(void *opaque) object_ref(OBJECT(s)); update_iteration_initial_status(s); + if (!multifd_send_setup()) { + goto out; + } + qemu_mutex_lock_iothread(); qemu_savevm_state_header(s->to_dst_file); qemu_mutex_unlock_iothread(); @@ -3377,6 +3428,7 @@ static void *migration_thread(void *opaque) urgent = migration_rate_limit(); } +out: trace_migration_thread_after_loop(); migration_iteration_finish(s); object_unref(OBJECT(s)); @@ -3630,15 +3682,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_save_setup(&local_err) != 0) { - migrate_set_error(s, local_err); - error_report_err(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); - migrate_fd_cleanup(s); - return; - } - if (migrate_background_snapshot()) { qemu_thread_create(&s->thread, "bg_snapshot", bg_migration_thread, s, QEMU_THREAD_JOINABLE); diff --git a/migration/migration.h b/migration/migration.h index cf2c9c88e01d670b5b4c9d3de315a1b37a4f0d21..eeddb7c0bde3ec700200820aca6b99bb2544ee9d 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -482,6 +482,7 @@ bool migration_has_all_channels(void); uint64_t migrate_max_downtime(void); +void migrate_fd_error(MigrationState *s, const Error *error); void migrate_set_error(MigrationState *s, const Error *error); bool migrate_has_error(MigrationState *s); @@ -550,4 +551,8 @@ void migration_rp_kick(MigrationState *s); int migration_stop_vm(RunState state); +void migrate_fd_cancel(MigrationState *s); + +bool memcrypt_enabled(void); + #endif diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file mode 100644 index 0000000000000000000000000000000000000000..88b6fb44ad66263629edf1e79f15fe39900c43c3 --- /dev/null +++ b/migration/multifd-qatzip.c @@ -0,0 +1,395 @@ +/* + * Multifd QATzip compression implementation + * + * Copyright (c) Bytedance + * + * Authors: + * Bryan Zhang + * Hao Xiang + * Yichen Wang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "exec/ramblock.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qapi-types-migration.h" +#include "options.h" +#include "multifd.h" +#include + +typedef struct { + /* + * Unique session for use with QATzip API + */ + QzSession_T sess; + + /* + * For compression: Buffer for pages to compress + * For decompression: Buffer for data to decompress + */ + uint8_t *in_buf; + uint32_t in_len; + + /* + * For compression: Output buffer of compressed data + * For decompression: Output buffer of decompressed data + */ + uint8_t *out_buf; + uint32_t out_len; +} QatzipData; + +/** + * qatzip_send_setup: Set up QATzip session and private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, 2); + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + /* Make sure to use configured QATzip compression level. */ + params.common_params.comp_lvl = migrate_multifd_qatzip_level(); + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + if (MULTIFD_PACKET_SIZE > UINT32_MAX) { + err_msg = "packet size too large for QAT"; + goto err; + } + + q->in_len = MULTIFD_PACKET_SIZE; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess); + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_send_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return None + */ +static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + + g_free(p->iov); + p->iov = NULL; + p->compress_data = NULL; +} + +/** + * qatzip_send_prepare: Compress pages and update IO channel info. 
+ * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp) +{ + MultiFDPages_t *pages = p->pages; + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* + * Unlike other multifd compression implementations, we use a non-streaming + * API and place all the data into one buffer, rather than sending each + * page to the compression API at a time. Based on initial benchmarks, the + * non-streaming API outperforms the streaming API. Plus, the logic in QEMU + * is friendly to using the non-streaming API anyway. If either of these + * statements becomes no longer true, we can revisit adding a streaming + * implementation. + */ + for (int i = 0; i < pages->normal_num; i++) { + memcpy(q->in_buf + (i * p->page_size), + pages->block->host + pages->offset[i], + p->page_size); + } + + in_len = pages->normal_num * p->page_size; + if (in_len > q->in_len) { + error_setg(errp, "multifd %u: unexpectedly large input", p->id); + return -1; + } + out_len = q->out_len; + + ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK", + p->id, ret); + return -1; + } + if (in_len != pages->normal_num * p->page_size) { + error_setg(errp, "multifd %u: QATzip failed to compress all input", + p->id); + return -1; + } + + p->iov[p->iovs_num].iov_base = q->out_buf; + p->iov[p->iovs_num].iov_len = out_len; + p->iovs_num++; + p->next_packet_size = out_len; + +out: + p->flags |= MULTIFD_FLAG_QATZIP; + multifd_send_fill_packet(p); + return 0; +} + +/** + * qatzip_recv_setup: Set up QATzip session and allocate private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + /* + * Reserve extra spaces for the incoming packets. Current implementation + * doesn't send uncompressed pages in case the compression gets too big. + */ + q->in_len = MULTIFD_PACKET_SIZE * 2; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. 
+ */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = MULTIFD_PACKET_SIZE; + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_recv_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @return None + */ +static void qatzip_recv_cleanup(MultiFDRecvParams *p) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + p->compress_data = NULL; +} + + +/** + * qatzip_recv: Decompress pages and copy them to the appropriate + * locations. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + uint32_t in_size = p->next_packet_size; + uint32_t expected_size = p->normal_num * p->page_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (in_size > q->in_len) { + error_setg(errp, "multifd %u: received unexpectedly large packet", + p->id); + return -1; + } + + if (flags != MULTIFD_FLAG_QATZIP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QATZIP); + return -1; + } + + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp); + if (ret != 0) { + return ret; + } + + in_len = in_size; + out_len = q->out_len; + ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: qzDecompress failed", p->id); + return -1; + } + if (out_len != expected_size) { + error_setg(errp, "multifd %u: packet size received %u size expected %u", + p->id, out_len, expected_size); + return -1; + } + + /* Copy each page to its appropriate location. 
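+     * (The payload is one deflate stream covering all normal_num pages back
+     * to back, so page i of the decompressed output lives at
+     * out_buf + i * page_size.)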
+     */
+    for (int i = 0; i < p->normal_num; i++) {
+        memcpy(p->host + p->normal[i],
+               q->out_buf + p->page_size * i,
+               p->page_size);
+        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
+    }
+    return 0;
+}
+
+static MultiFDMethods multifd_qatzip_ops = {
+    .send_setup = qatzip_send_setup,
+    .send_cleanup = qatzip_send_cleanup,
+    .send_prepare = qatzip_send_prepare,
+    .recv_setup = qatzip_recv_setup,
+    .recv_cleanup = qatzip_recv_cleanup,
+    .recv = qatzip_recv
+};
+
+static void multifd_qatzip_register(void)
+{
+    multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops);
+}
+
+migration_init(multifd_qatzip_register);
diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
new file mode 100644
index 0000000000000000000000000000000000000000..fea60e39379511dac03d63ada0fb0499f31e48ff
--- /dev/null
+++ b/migration/multifd-qpl.c
@@ -0,0 +1,763 @@
+/*
+ * Multifd qpl compression accelerator implementation
+ *
+ * Copyright (c) 2023 Intel Corporation
+ *
+ * Authors:
+ *  Yuan Liu
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "qapi/qapi-types-migration.h"
+#include "exec/ramblock.h"
+#include "multifd.h"
+#include "qpl/qpl.h"
+
+/* Maximum number of retries to resubmit a job if IAA work queues are full */
+#define MAX_SUBMIT_RETRY_NUM (3)
+
+typedef struct {
+    /* the QPL hardware path job */
+    qpl_job *job;
+    /* indicates if fallback to software path is required */
+    bool fallback_sw_path;
+    /* output data from the software path */
+    uint8_t *sw_output;
+    /* output data length from the software path */
+    uint32_t sw_output_len;
+} QplHwJob;
+
+typedef struct {
+    /* array of hardware jobs, the number of jobs equals the number of pages */
+    QplHwJob *hw_jobs;
+    /* the QPL software job for the slow path and software fallback */
+    qpl_job *sw_job;
+    /* the number of pages that the QPL needs to process at one time */
+    uint32_t page_num;
+    /* array of compressed page buffers */
+    uint8_t *zbuf;
+    /* array of compressed page lengths */
+    uint32_t *zlen;
+    /* the status of the hardware device */
+    bool hw_avail;
+} QplData;
+
+/**
+ * check_hw_avail: check if IAA hardware is available
+ *
+ * If the IAA hardware does not exist or is unavailable,
+ * the QPL hardware job initialization will fail.
+ *
+ * Returns true if IAA hardware is available, otherwise false.
+ *
+ * @job_size: indicates the hardware job size if hardware is available
+ */
+static bool check_hw_avail(uint32_t *job_size)
+{
+    qpl_path_t path = qpl_path_hardware;
+    uint32_t size = 0;
+    qpl_job *job;
+
+    if (qpl_get_job_size(path, &size) != QPL_STS_OK) {
+        return false;
+    }
+    assert(size > 0);
+    job = g_malloc0(size);
+    if (qpl_init_job(path, job) != QPL_STS_OK) {
+        g_free(job);
+        return false;
+    }
+    g_free(job);
+    *job_size = size;
+    return true;
+}
+
+/**
+ * multifd_qpl_free_sw_job: clean up the software job
+ *
+ * Free the software job resources.
+ *
+ * @qpl: pointer to the QplData structure
+ */
+static void multifd_qpl_free_sw_job(QplData *qpl)
+{
+    assert(qpl);
+    if (qpl->sw_job) {
+        qpl_fini_job(qpl->sw_job);
+        g_free(qpl->sw_job);
+        qpl->sw_job = NULL;
+    }
+}
+
+/**
+ * multifd_qpl_free_hw_job: clean up the hardware jobs
+ *
+ * Free all hardware job resources.
+ * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_free_hw_job(QplData *qpl) +{ + assert(qpl); + if (qpl->hw_jobs) { + for (int i = 0; i < qpl->page_num; i++) { + qpl_fini_job(qpl->hw_jobs[i].job); + g_free(qpl->hw_jobs[i].job); + qpl->hw_jobs[i].job = NULL; + } + g_free(qpl->hw_jobs); + qpl->hw_jobs = NULL; + } +} + +/** + * multifd_qpl_init_sw_job: initialize a software job + * + * Use the QPL software path to initialize a job + * + * @qpl: pointer to the QplData structure + * @errp: pointer to an error + */ +static int multifd_qpl_init_sw_job(QplData *qpl, Error **errp) +{ + qpl_path_t path = qpl_path_software; + uint32_t size = 0; + qpl_job *job = NULL; + qpl_status status; + + status = qpl_get_job_size(path, &size); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_get_job_size failed with error %d", status); + return -1; + } + job = g_malloc0(size); + status = qpl_init_job(path, job); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_init_job failed with error %d", status); + g_free(job); + return -1; + } + qpl->sw_job = job; + return 0; +} + +/** + * multifd_qpl_init_jobs: initialize hardware jobs + * + * Use the QPL hardware path to initialize jobs + * + * @qpl: pointer to the QplData structure + * @size: the size of QPL hardware path job + * @errp: pointer to an error + */ +static void multifd_qpl_init_hw_job(QplData *qpl, uint32_t size, Error **errp) +{ + qpl_path_t path = qpl_path_hardware; + qpl_job *job = NULL; + qpl_status status; + + qpl->hw_jobs = g_new0(QplHwJob, qpl->page_num); + for (int i = 0; i < qpl->page_num; i++) { + job = g_malloc0(size); + status = qpl_init_job(path, job); + /* the job initialization should succeed after check_hw_avail */ + assert(status == QPL_STS_OK); + qpl->hw_jobs[i].job = job; + } +} + +/** + * multifd_qpl_init: initialize QplData structure + * + * Allocate and initialize a QplData structure + * + * Returns a QplData pointer on success or NULL on error + * + * @num: the number of pages + * @size: the page size + * @errp: pointer to an error + */ +static QplData *multifd_qpl_init(uint32_t num, uint32_t size, Error **errp) +{ + uint32_t job_size = 0; + QplData *qpl; + + qpl = g_new0(QplData, 1); + qpl->page_num = num; + if (multifd_qpl_init_sw_job(qpl, errp) != 0) { + g_free(qpl); + return NULL; + } + qpl->hw_avail = check_hw_avail(&job_size); + if (qpl->hw_avail) { + multifd_qpl_init_hw_job(qpl, job_size, errp); + } + qpl->zbuf = g_malloc0(size * num); + qpl->zlen = g_new0(uint32_t, num); + return qpl; +} + +/** + * multifd_qpl_deinit: clean up QplData structure + * + * Free jobs, buffers and the QplData structure + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_deinit(QplData *qpl) +{ + if (qpl) { + multifd_qpl_free_sw_job(qpl); + multifd_qpl_free_hw_job(qpl); + g_free(qpl->zbuf); + g_free(qpl->zlen); + g_free(qpl); + } +} + +/** + * multifd_qpl_send_setup: set up send side + * + * Set up the channel with QPL compression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + + /* + * the page will be compressed independently and sent using an IOV. 
The + * additional two IOVs are used to store packet header and compressed data + * length + */ + p->iov = g_new0(struct iovec, p->page_count + 2); + return 0; +} + +/** + * multifd_qpl_send_cleanup: clean up send side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; + g_free(p->iov); + p->iov = NULL; +} + +/** + * multifd_qpl_prepare_job: prepare the job + * + * Set the QPL job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @is_compression: indicates compression and decompression + * @input: pointer to the input data buffer + * @input_len: the length of the input data + * @output: pointer to the output data buffer + * @output_len: the length of the output data + */ +static void multifd_qpl_prepare_job(qpl_job *job, bool is_compression, + uint8_t *input, uint32_t input_len, + uint8_t *output, uint32_t output_len) +{ + job->op = is_compression ? qpl_op_compress : qpl_op_decompress; + job->next_in_ptr = input; + job->next_out_ptr = output; + job->available_in = input_len; + job->available_out = output_len; + job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST | QPL_FLAG_OMIT_VERIFY; + /* only supports compression level 1 */ + job->level = 1; +} + +/** + * multifd_qpl_prepare_comp_job: prepare the compression job + * + * Set the compression job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_comp_job(qpl_job *job, uint8_t *input, + uint8_t *output, uint32_t size) +{ + /* + * Set output length to less than the page size to force the job to + * fail in case it compresses to a larger size. We'll send that page + * without compression and skip the decompression operation on the + * destination. + */ + multifd_qpl_prepare_job(job, true, input, size, output, size - 1); +} + +/** + * multifd_qpl_prepare_decomp_job: prepare the decompression job + * + * Set the decompression job parameters and properties. 
+ * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @len: the length of the input data + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_decomp_job(qpl_job *job, uint8_t *input, + uint32_t len, uint8_t *output, + uint32_t size) +{ + multifd_qpl_prepare_job(job, false, input, len, output, size); +} + +/** + * multifd_qpl_fill_iov: fill in the IOV + * + * Fill in the QPL packet IOV + * + * @p: Params for the channel being used + * @data: pointer to the IOV data + * @len: The length of the IOV data + */ +static void multifd_qpl_fill_iov(MultiFDSendParams *p, uint8_t *data, + uint32_t len) +{ + p->iov[p->iovs_num].iov_base = data; + p->iov[p->iovs_num].iov_len = len; + p->iovs_num++; + p->next_packet_size += len; +} + +/** + * multifd_qpl_fill_packet: fill the compressed page into the QPL packet + * + * Fill the compressed page length and IOV into the QPL packet + * + * @idx: The index of the compressed length array + * @p: Params for the channel being used + * @data: pointer to the compressed page buffer + * @len: The length of the compressed page + */ +static void multifd_qpl_fill_packet(uint32_t idx, MultiFDSendParams *p, + uint8_t *data, uint32_t len) +{ + QplData *qpl = p->compress_data; + + qpl->zlen[idx] = cpu_to_be32(len); + multifd_qpl_fill_iov(p, data, len); +} + +/** + * multifd_qpl_submit_job: submit a job to the hardware + * + * Submit a QPL hardware job to the IAA device + * + * Returns true if the job is submitted successfully, otherwise false. + * + * @job: pointer to the qpl_job structure + */ +static bool multifd_qpl_submit_job(qpl_job *job) +{ + qpl_status status; + uint32_t num = 0; + +retry: + status = qpl_submit_job(job); + if (status == QPL_STS_QUEUES_ARE_BUSY_ERR) { + if (num < MAX_SUBMIT_RETRY_NUM) { + num++; + goto retry; + } + } + return (status == QPL_STS_OK); +} + +/** + * multifd_qpl_compress_pages_slow_path: compress pages using slow path + * + * Compress the pages using software. If compression fails, the uncompressed + * page will be sent. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *buf; + + for (int i = 0; i < p->pages->normal_num; i++) { + buf = p->pages->block->host + p->pages->offset[i]; + multifd_qpl_prepare_comp_job(job, buf, zbuf, size); + if (qpl_execute_job(job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + zbuf += size; + } +} + +/** + * multifd_qpl_compress_pages: compress pages + * + * Submit the pages to the IAA hardware for compression. If hardware + * compression fails, it falls back to software compression. If software + * compression also fails, the uncompressed page is sent. 
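+ *
+ * For reference, the packet payload assembled via multifd_qpl_send_prepare()
+ * below is a big-endian length table followed by the per-page data:
+ *
+ *   | be32 zlen[normal_num] | page 0 data | page 1 data | ... |
+ *
+ * where zlen[i] == page_size marks a page stored uncompressed.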
+ * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + MultiFDPages_t *pages = p->pages; + uint32_t size = p->page_size; + QplHwJob *hw_job; + uint8_t *buf; + uint8_t *zbuf; + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + multifd_qpl_prepare_comp_job(hw_job->job, buf, zbuf, size); + if (multifd_qpl_submit_job(hw_job->job)) { + hw_job->fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. + */ + hw_job->fallback_sw_path = true; + multifd_qpl_prepare_comp_job(qpl->sw_job, buf, zbuf, size); + if (qpl_execute_job(qpl->sw_job) == QPL_STS_OK) { + hw_job->sw_output = zbuf; + hw_job->sw_output_len = qpl->sw_job->total_out; + } else { + hw_job->sw_output = buf; + hw_job->sw_output_len = size; + } + } + } + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + if (hw_job->fallback_sw_path) { + multifd_qpl_fill_packet(i, p, hw_job->sw_output, + hw_job->sw_output_len); + continue; + } + if (qpl_wait_job(hw_job->job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, hw_job->job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + } +} + +/** + * multifd_qpl_send_prepare: prepare data to be able to send + * + * Create a compressed buffer with all the pages that we are going to + * send. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t len = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* The first IOV is used to store the compressed page lengths */ + len = p->pages->normal_num * sizeof(uint32_t); + multifd_qpl_fill_iov(p, (uint8_t *) qpl->zlen, len); + if (qpl->hw_avail) { + multifd_qpl_compress_pages(p); + } else { + multifd_qpl_compress_pages_slow_path(p); + } + +out: + p->flags |= MULTIFD_FLAG_QPL; + multifd_send_fill_packet(p); + return 0; +} + +/** + * multifd_qpl_recv_setup: set up receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + return 0; +} + +/** + * multifd_qpl_recv_cleanup: set up receive side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; +} + +/** + * multifd_qpl_process_and_check_job: process and check a QPL job + * + * Process the job and check whether the job output length is the + * same as the specified length + * + * Returns true if the job execution succeeded and the output length + * is equal to the specified length, otherwise false. 
+ * + * @job: pointer to the qpl_job structure + * @is_hardware: indicates whether the job is a hardware job + * @len: Specified output length + * @errp: pointer to an error + */ +static bool multifd_qpl_process_and_check_job(qpl_job *job, bool is_hardware, + uint32_t len, Error **errp) +{ + qpl_status status; + + status = (is_hardware ? qpl_wait_job(job) : qpl_execute_job(job)); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl job failed with error %d", status); + return false; + } + if (job->total_out != len) { + error_setg(errp, "qpl decompressed len %u, expected len %u", + job->total_out, len); + return false; + } + return true; +} + +/** + * multifd_qpl_decompress_pages_slow_path: decompress pages using slow path + * + * Decompress the pages using software + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p, + Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + + for (int i = 0; i < p->normal_num; i++) { + len = qpl->zlen[i]; + addr = p->host + p->normal[i]; + /* the page is uncompressed, load it */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + zbuf += len; + } + return 0; +} + +/** + * multifd_qpl_decompress_pages: decompress pages + * + * Decompress the pages using the IAA hardware. If hardware + * decompression fails, it falls back to software decompression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + qpl_job *job; + + for (int i = 0; i < p->normal_num; i++) { + addr = p->host + p->normal[i]; + len = qpl->zlen[i]; + /* the page is uncompressed if received length equals the page size */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + + job = qpl->hw_jobs[i].job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (multifd_qpl_submit_job(job)) { + qpl->hw_jobs[i].fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. + */ + qpl->hw_jobs[i].fallback_sw_path = true; + job = qpl->sw_job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + } + zbuf += len; + } + + for (int i = 0; i < p->normal_num; i++) { + /* ignore pages that have already been processed */ + if (qpl->zlen[i] == size || qpl->hw_jobs[i].fallback_sw_path) { + continue; + } + + job = qpl->hw_jobs[i].job; + if (!multifd_qpl_process_and_check_job(job, true, size, errp)) { + return -1; + } + } + return 0; +} +/** + * multifd_qpl_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. 
+ * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t in_size = p->next_packet_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t len = 0; + uint32_t zbuf_len = 0; + int ret; + + if (flags != MULTIFD_FLAG_QPL) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QPL); + return -1; + } + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + /* read compressed page lengths */ + len = p->normal_num * sizeof(uint32_t); + assert(len < in_size); + ret = qio_channel_read_all(p->c, (void *) qpl->zlen, len, errp); + if (ret != 0) { + return ret; + } + for (int i = 0; i < p->normal_num; i++) { + qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); + assert(qpl->zlen[i] <= p->page_size); + zbuf_len += qpl->zlen[i]; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); + } + + /* read compressed pages */ + assert(in_size == len + zbuf_len); + ret = qio_channel_read_all(p->c, (void *) qpl->zbuf, zbuf_len, errp); + if (ret != 0) { + return ret; + } + + if (qpl->hw_avail) { + return multifd_qpl_decompress_pages(p, errp); + } + return multifd_qpl_decompress_pages_slow_path(p, errp); +} + +static MultiFDMethods multifd_qpl_ops = { + .send_setup = multifd_qpl_send_setup, + .send_cleanup = multifd_qpl_send_cleanup, + .send_prepare = multifd_qpl_send_prepare, + .recv_setup = multifd_qpl_recv_setup, + .recv_cleanup = multifd_qpl_recv_cleanup, + .recv = multifd_qpl_recv, +}; + +static void multifd_qpl_register(void) +{ + multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops); +} + +migration_init(multifd_qpl_register); diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c new file mode 100644 index 0000000000000000000000000000000000000000..9a582fc91986981a612e77f1da03af3cacd6e7e6 --- /dev/null +++ b/migration/multifd-uadk.c @@ -0,0 +1,371 @@ +/* + * Multifd UADK compression accelerator implementation + * + * Copyright (c) 2024 Huawei Technologies R & D (UK) Ltd + * + * Authors: + * Shameer Kolothum + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "exec/ramblock.h"
+#include "migration.h"
+#include "multifd.h"
+#include "options.h"
+#include "qemu/error-report.h"
+#include "uadk/wd_comp.h"
+#include "uadk/wd_sched.h"
+
+struct wd_data {
+    handle_t handle;
+    uint8_t *buf;
+    uint32_t *buf_hdr;
+};
+
+static bool uadk_hw_init(void)
+{
+    char alg[] = "zlib";
+    int ret;
+
+    ret = wd_comp_init2(alg, SCHED_POLICY_RR, TASK_HW);
+    if (ret && ret != -WD_EEXIST) {
+        return false;
+    } else {
+        return true;
+    }
+}
+
+static struct wd_data *multifd_uadk_init_sess(uint32_t count,
+                                              uint32_t page_size,
+                                              bool compress, Error **errp)
+{
+    struct wd_comp_sess_setup ss = {0};
+    struct sched_params param = {0};
+    uint32_t size = count * page_size;
+    struct wd_data *wd;
+
+    wd = g_new0(struct wd_data, 1);
+
+    if (uadk_hw_init()) {
+        ss.alg_type = WD_ZLIB;
+        if (compress) {
+            ss.op_type = WD_DIR_COMPRESS;
+            /* Add an additional page for handling output > input */
+            size += page_size;
+        } else {
+            ss.op_type = WD_DIR_DECOMPRESS;
+        }
+        /* We use default level 1 compression and 4K window size */
+        param.type = ss.op_type;
+        ss.sched_param = &param;
+
+        wd->handle = wd_comp_alloc_sess(&ss);
+        if (!wd->handle) {
+            error_setg(errp, "multifd: failed wd_comp_alloc_sess");
+            goto out;
+        }
+    } else {
+        /* For CI test use */
+        warn_report_once("UADK hardware not available. Switching to no compression mode");
+    }
+
+    wd->buf = g_try_malloc(size);
+    if (!wd->buf) {
+        error_setg(errp, "multifd: out of mem for uadk buf");
+        goto out_free_sess;
+    }
+    wd->buf_hdr = g_new0(uint32_t, count);
+    return wd;
+
+out_free_sess:
+    if (wd->handle) {
+        wd_comp_free_sess(wd->handle);
+    }
+out:
+    wd_comp_uninit2();
+    g_free(wd);
+    return NULL;
+}
+
+static void multifd_uadk_uninit_sess(struct wd_data *wd)
+{
+    if (wd->handle) {
+        wd_comp_free_sess(wd->handle);
+    }
+    wd_comp_uninit2();
+    g_free(wd->buf);
+    g_free(wd->buf_hdr);
+    g_free(wd);
+}
+
+/**
+ * multifd_uadk_send_setup: setup send side
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int multifd_uadk_send_setup(MultiFDSendParams *p, Error **errp)
+{
+    struct wd_data *wd;
+
+    wd = multifd_uadk_init_sess(p->page_count, p->page_size, true, errp);
+    if (!wd) {
+        return -1;
+    }
+
+    p->compress_data = wd;
+    assert(p->iov == NULL);
+    /*
+     * Each page will be compressed independently and sent using an IOV. The
+     * additional two IOVs are used to store packet header and compressed data
+     * length
+     */
+
+    p->iov = g_new0(struct iovec, p->page_count + 2);
+    return 0;
+}
+
+/**
+ * multifd_uadk_send_cleanup: cleanup send side
+ *
+ * Close the channel and return memory.
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static void multifd_uadk_send_cleanup(MultiFDSendParams *p, Error **errp)
+{
+    struct wd_data *wd = p->compress_data;
+
+    multifd_uadk_uninit_sess(wd);
+    p->compress_data = NULL;
+    g_free(p->iov);
+    p->iov = NULL;
+}
+
+static inline void prepare_next_iov(MultiFDSendParams *p, void *base,
+                                    uint32_t len)
+{
+    p->iov[p->iovs_num].iov_base = (uint8_t *)base;
+    p->iov[p->iovs_num].iov_len = len;
+    p->next_packet_size += len;
+    p->iovs_num++;
+}
+
+/**
+ * multifd_uadk_send_prepare: prepare data to be able to send
+ *
+ * Create a compressed buffer with all the pages that we are going to
+ * send.
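The IOV accounting described above maps onto plain POSIX scatter-gather I/O. A sketch (standard writev(), not the QIOChannel API the patch actually uses) of sending one header slot plus a run of pages in a single call:

#include <sys/uio.h>
#include <stdint.h>
#include <stddef.h>

enum { MAX_PAGES = 128 };

/* Gather a header and up to MAX_PAGES page buffers into one writev(). */
static ssize_t send_batch(int fd, void *hdr, size_t hdr_len,
                          uint8_t **pages, size_t n, size_t page_size)
{
    struct iovec iov[1 + MAX_PAGES];
    int iovs = 0;

    iov[iovs].iov_base = hdr;          /* slot 0: the packet header */
    iov[iovs].iov_len = hdr_len;
    iovs++;
    for (size_t i = 0; i < n && i < MAX_PAGES; i++) {
        iov[iovs].iov_base = pages[i]; /* one slot per page, no copying */
        iov[iovs].iov_len = page_size;
        iovs++;
    }
    /* A short write is possible; a real caller must loop until done. */
    return writev(fd, iov, iovs);
}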
+ * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp) +{ + struct wd_data *uadk_data = p->compress_data; + uint32_t hdr_size; + uint8_t *buf = uadk_data->buf; + int ret = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + hdr_size = p->pages->normal_num * sizeof(uint32_t); + /* prepare the header that stores the lengths of all compressed data */ + prepare_next_iov(p, uadk_data->buf_hdr, hdr_size); + + for (int i = 0; i < p->pages->normal_num; i++) { + struct wd_comp_req creq = { + .op_type = WD_DIR_COMPRESS, + .src = p->pages->block->host + p->pages->offset[i], + .src_len = p->page_size, + .dst = buf, + /* Set dst_len to double the src in case compressed out >= page_size */ + .dst_len = p->page_size * 2, + }; + + if (uadk_data->handle) { + ret = wd_do_comp_sync(uadk_data->handle, &creq); + if (ret || creq.status) { + error_setg(errp, "multifd %u: failed compression, ret %d status %d", + p->id, ret, creq.status); + return -1; + } + if (creq.dst_len < p->page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(creq.dst_len); + prepare_next_iov(p, buf, creq.dst_len); + buf += creq.dst_len; + } + } + /* + * Send raw data if no UADK hardware or if compressed out >= page_size. + * We might be better off sending raw data if output is slightly less + * than page_size as well because at the receive end we can skip the + * decompression. But it is tricky to find the right number here. + */ + if (!uadk_data->handle || creq.dst_len >= p->page_size) { + uadk_data->buf_hdr[i] = cpu_to_be32(p->page_size); + prepare_next_iov(p, p->pages->block->host + p->pages->offset[i], + p->page_size); + buf += p->page_size; + } + } +out: + p->flags |= MULTIFD_FLAG_UADK; + multifd_send_fill_packet(p); + return 0; +} + +/** + * multifd_uadk_recv_setup: setup receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 for success or -1 for error + * + * @p: Params for the channel that we are using + * @errp: pointer to an error + */ +static int multifd_uadk_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + struct wd_data *wd; + + wd = multifd_uadk_init_sess(p->page_count, p->page_size, false, errp); + if (!wd) { + return -1; + } + p->compress_data = wd; + return 0; +} + +/** + * multifd_uadk_recv_cleanup: cleanup receive side + * + * Close the channel and return memory. + * + * @p: Params for the channel that we are using + */ +static void multifd_uadk_recv_cleanup(MultiFDRecvParams *p) +{ + struct wd_data *wd = p->compress_data; + + multifd_uadk_uninit_sess(wd); + p->compress_data = NULL; +} + +/** + * multifd_uadk_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. 
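Since the receive side trusts nothing in the stream, the length header has to be validated before any buffer is sized from it, which is what the asserts in the function below enforce. A standalone sketch of the same check (plain ntohl() standing in for be32_to_cpu(); check_frame() is a hypothetical helper):

#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>   /* ntohl */

/* Validate a frame of n big-endian 32-bit lengths plus payload_len
 * bytes of payload: every length must fit one page, and the lengths
 * must account for every payload byte. */
static bool check_frame(const uint32_t *be_lens, uint32_t n,
                        uint32_t page_size, uint32_t payload_len)
{
    uint64_t total = 0;   /* 64-bit so the sum cannot wrap */

    for (uint32_t i = 0; i < n; i++) {
        uint32_t len = ntohl(be_lens[i]);
        if (len == 0 || len > page_size) {
            return false;             /* corrupt or hostile length field */
        }
        total += len;
    }
    return total == payload_len;
}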
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int multifd_uadk_recv(MultiFDRecvParams *p, Error **errp)
+{
+    struct wd_data *uadk_data = p->compress_data;
+    uint32_t in_size = p->next_packet_size;
+    uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
+    uint32_t hdr_len = p->normal_num * sizeof(uint32_t);
+    uint32_t data_len = 0;
+    uint8_t *buf = uadk_data->buf;
+    int ret = 0;
+
+    if (flags != MULTIFD_FLAG_UADK) {
+        error_setg(errp, "multifd %u: flags received %x flags expected %x",
+                   p->id, flags, MULTIFD_FLAG_UADK);
+        return -1;
+    }
+
+    multifd_recv_zero_page_process(p);
+    if (!p->normal_num) {
+        assert(in_size == 0);
+        return 0;
+    }
+
+    /* read compressed data lengths */
+    assert(hdr_len < in_size);
+    ret = qio_channel_read_all(p->c, (void *) uadk_data->buf_hdr,
+                               hdr_len, errp);
+    if (ret != 0) {
+        return ret;
+    }
+
+    for (int i = 0; i < p->normal_num; i++) {
+        uadk_data->buf_hdr[i] = be32_to_cpu(uadk_data->buf_hdr[i]);
+        data_len += uadk_data->buf_hdr[i];
+        assert(uadk_data->buf_hdr[i] <= p->page_size);
+    }
+
+    /* read compressed data */
+    assert(in_size == hdr_len + data_len);
+    ret = qio_channel_read_all(p->c, (void *)buf, data_len, errp);
+    if (ret != 0) {
+        return ret;
+    }
+
+    for (int i = 0; i < p->normal_num; i++) {
+        struct wd_comp_req creq = {
+            .op_type = WD_DIR_DECOMPRESS,
+            .src = buf,
+            .src_len = uadk_data->buf_hdr[i],
+            .dst = p->host + p->normal[i],
+            .dst_len = p->page_size,
+        };
+
+        if (uadk_data->buf_hdr[i] == p->page_size) {
+            memcpy(p->host + p->normal[i], buf, p->page_size);
+            buf += p->page_size;
+            continue;
+        }
+
+        if (unlikely(!uadk_data->handle)) {
+            error_setg(errp, "multifd %u: UADK HW not available for decompression",
+                       p->id);
+            return -1;
+        }
+
+        ret = wd_do_comp_sync(uadk_data->handle, &creq);
+        if (ret || creq.status) {
+            error_setg(errp, "multifd %u: failed decompression, ret %d status %d",
+                       p->id, ret, creq.status);
+            return -1;
+        }
+        if (creq.dst_len != p->page_size) {
+            error_setg(errp, "multifd %u: decompressed length error", p->id);
+            return -1;
+        }
+        buf += uadk_data->buf_hdr[i];
+    }
+
+    return 0;
+}
+
+static MultiFDMethods multifd_uadk_ops = {
+    .send_setup = multifd_uadk_send_setup,
+    .send_cleanup = multifd_uadk_send_cleanup,
+    .send_prepare = multifd_uadk_send_prepare,
+    .recv_setup = multifd_uadk_recv_setup,
+    .recv_cleanup = multifd_uadk_recv_cleanup,
+    .recv = multifd_uadk_recv,
+};
+
+static void multifd_uadk_register(void)
+{
+    multifd_register_ops(MULTIFD_COMPRESSION_UADK, &multifd_uadk_ops);
+}
+
+migration_init(multifd_uadk_register);
diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
new file mode 100644
index 0000000000000000000000000000000000000000..e1b8370f88e1ef91d98eb2375b242327c7c5bdc4
--- /dev/null
+++ b/migration/multifd-zero-page.c
@@ -0,0 +1,89 @@
+/*
+ * Multifd zero page detection implementation.
+ *
+ * Copyright (c) 2024 Bytedance Inc
+ *
+ * Authors:
+ *  Hao Xiang
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "exec/ramblock.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "ram.h" + +static bool multifd_zero_page_enabled(void) +{ + return migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD; +} + +static void swap_page_offset(ram_addr_t *pages_offset, int a, int b) +{ + ram_addr_t temp; + + if (a == b) { + return; + } + + temp = pages_offset[a]; + pages_offset[a] = pages_offset[b]; + pages_offset[b] = temp; +} + +/** + * multifd_send_zero_page_detect: Perform zero page detection on all pages. + * + * Sorts normal pages before zero pages in p->pages->offset and updates + * p->pages->normal_num. + * + * @param p A pointer to the send params. + */ +void multifd_send_zero_page_detect(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + RAMBlock *rb = pages->block; + int i = 0; + int j = pages->num - 1; + + if (!multifd_zero_page_enabled()) { + pages->normal_num = pages->num; + return; + } + + /* + * Sort the page offset array by moving all normal pages to + * the left and all zero pages to the right of the array. + */ + while (i <= j) { + uint64_t offset = pages->offset[i]; + + if (!buffer_is_zero(rb->host + offset, p->page_size)) { + i++; + continue; + } + + swap_page_offset(pages->offset, i, j); + ram_release_page(rb->idstr, offset); + j--; + } + + pages->normal_num = i; +} + +void multifd_recv_zero_page_process(MultiFDRecvParams *p) +{ + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; + if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { + memset(page, 0, p->page_size); + } else { + ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); + } + } +} diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 37ce48621e7866da0995a44be6977daa3fde591a..2df498378023db1f6727ba4eb562e529f4c849e5 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -69,7 +69,11 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) err_msg = "out of memory for buf"; goto err_free_zbuff; } - p->data = z; + p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); + return 0; err_free_zbuff: @@ -92,15 +96,18 @@ err_free_z: */ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; deflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; g_free(z->buf); z->buf = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** @@ -116,17 +123,22 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + MultiFDPages_t *pages = p->pages; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; uint32_t i; - for (i = 0; i < p->normal_num; i++) { + if (!multifd_send_prepare_common(p)) { + goto out; + } + + for (i = 0; i < pages->normal_num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == p->normal_num - 1) { + if (i == pages->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -135,7 +147,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) * with compression. zlib does not guarantee that this is safe, * therefore copy the page before calling deflate(). 
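The detection loop in multifd_send_zero_page_detect() above is a classic two-pointer partition. A self-contained sketch of the same idea, with a memcmp-based page_is_zero() standing in for QEMU's buffer_is_zero():

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Stand-in for buffer_is_zero(): true if the page is all zero bytes. */
static bool page_is_zero(const unsigned char *page, size_t size)
{
    return page[0] == 0 && memcmp(page, page + 1, size - 1) == 0;
}

/* Partition offsets[] so non-zero pages end up on the left. Returns the
 * number of non-zero ("normal") pages. Relative order is not preserved,
 * which is fine because each page carries its own offset. */
static size_t partition_zero_pages(const unsigned char *base, size_t page_size,
                                   size_t *offsets, size_t num)
{
    size_t i = 0, j = num;          /* j is one past the last candidate */

    while (i < j) {
        if (!page_is_zero(base + offsets[i], page_size)) {
            i++;                    /* normal page: already on the left */
            continue;
        }
        size_t tmp = offsets[i];    /* zero page: swap it to the tail */
        offsets[i] = offsets[--j];
        offsets[j] = tmp;
    }
    return i;
}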
*/ - memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size); + memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size); zs->avail_in = p->page_size; zs->next_in = z->buf; @@ -169,8 +181,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = out_size; p->iovs_num++; p->next_packet_size = out_size; - p->flags |= MULTIFD_FLAG_ZLIB; +out: + p->flags |= MULTIFD_FLAG_ZLIB; + multifd_send_fill_packet(p); return 0; } @@ -189,7 +203,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; - p->data = z; + p->compress_data = z; zs->zalloc = Z_NULL; zs->zfree = Z_NULL; zs->opaque = Z_NULL; @@ -219,17 +233,17 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zlib_recv_cleanup(MultiFDRecvParams *p) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; inflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** - * zlib_recv_pages: read the data from the channel into actual pages + * zlib_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. @@ -239,9 +253,9 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zlib_recv(MultiFDRecvParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ @@ -256,6 +270,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZLIB); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { @@ -269,6 +291,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) int flush = Z_NO_FLUSH; unsigned long start = zs->total_out; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); if (i == p->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -305,6 +328,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) p->id, out_size, expected_size); return -1; } + return 0; } @@ -314,7 +338,7 @@ static MultiFDMethods multifd_zlib_ops = { .send_prepare = zlib_send_prepare, .recv_setup = zlib_recv_setup, .recv_cleanup = zlib_recv_cleanup, - .recv_pages = zlib_recv_pages + .recv = zlib_recv }; static void multifd_zlib_register(void) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index b471daadcd083a04fb495b3d1e06436d119d4c24..46ee68b6cef1cf85f23b946bbd50d9eebdb82dde 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,6 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -77,6 +76,10 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) error_setg(errp, "multifd %u: out of memory for zbuff", p->id); return -1; } + p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); return 0; } @@ -90,14 +93,17 @@ static int 
zstd_send_setup(MultiFDSendParams *p, Error **errp) */ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeCStream(z->zcs); z->zcs = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** @@ -113,21 +119,26 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + MultiFDPages_t *pages = p->pages; + struct zstd_data *z = p->compress_data; int ret; uint32_t i; + if (!multifd_send_prepare_common(p)) { + goto out; + } + z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->normal_num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == p->normal_num - 1) { + if (i == pages->normal_num - 1) { flush = ZSTD_e_flush; } - z->in.src = p->pages->block->host + p->normal[i]; + z->in.src = p->pages->block->host + pages->offset[i]; z->in.size = p->page_size; z->in.pos = 0; @@ -141,9 +152,9 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) */ do { ret = ZSTD_compressStream2(z->zcs, &z->out, &z->in, flush); - } while (ret > 0 && (z->in.size - z->in.pos > 0) - && (z->out.size - z->out.pos > 0)); - if (ret > 0 && (z->in.size - z->in.pos > 0)) { + } while (ret > 0 && (z->in.size > z->in.pos) + && (z->out.size > z->out.pos)); + if (ret > 0 && (z->in.size > z->in.pos)) { error_setg(errp, "multifd %u: compressStream buffer too small", p->id); return -1; @@ -158,8 +169,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = z->out.pos; p->iovs_num++; p->next_packet_size = z->out.pos; - p->flags |= MULTIFD_FLAG_ZSTD; +out: + p->flags |= MULTIFD_FLAG_ZSTD; + multifd_send_fill_packet(p); return 0; } @@ -178,7 +191,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int ret; - p->data = z; + p->compress_data = z; z->zds = ZSTD_createDStream(); if (!z->zds) { g_free(z); @@ -216,18 +229,18 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zstd_recv_cleanup(MultiFDRecvParams *p) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeDStream(z->zds); z->zds = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** - * zstd_recv_pages: read the data from the channel into actual pages + * zstd_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. 
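The ZSTD_decompressStream() loop shape that the hunks below adjust can be exercised standalone. A minimal round-trip sketch against the public zstd streaming API (buffer sizes are illustrative):

#include <zstd.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    char src[4096] = "page payload", comp[8192], out[4096];
    size_t clen = ZSTD_compress(comp, sizeof(comp), src, sizeof(src), 1);
    ZSTD_DStream *zds = ZSTD_createDStream();
    ZSTD_inBuffer in = { comp, clen, 0 };
    ZSTD_outBuffer o = { out, sizeof(out), 0 };
    size_t ret;

    if (ZSTD_isError(clen) || !zds) {
        return 1;
    }
    /* Same loop shape as the patch: keep feeding while zstd asks for
     * more and both buffers still have input/room left. */
    do {
        ret = ZSTD_decompressStream(zds, &o, &in);
    } while (ret > 0 && !ZSTD_isError(ret)
             && in.size > in.pos && o.pos < sizeof(out));

    printf("decompressed %zu bytes, match=%d\n", o.pos,
           memcmp(src, out, sizeof(src)) == 0);
    ZSTD_freeDStream(zds);
    return ZSTD_isError(ret) ? 1 : 0;
}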
@@ -237,13 +250,13 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p)
  * @p: Params for the channel that we are using
  * @errp: pointer to an error
  */
-static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
+static int zstd_recv(MultiFDRecvParams *p, Error **errp)
 {
     uint32_t in_size = p->next_packet_size;
     uint32_t out_size = 0;
     uint32_t expected_size = p->normal_num * p->page_size;
     uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;
-    struct zstd_data *z = p->data;
+    struct zstd_data *z = p->compress_data;
     int ret;
     int i;
 
@@ -252,6 +265,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
                    p->id, flags, MULTIFD_FLAG_ZSTD);
         return -1;
     }
+
+    multifd_recv_zero_page_process(p);
+
+    if (!p->normal_num) {
+        assert(in_size == 0);
+        return 0;
+    }
+
     ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
 
     if (ret != 0) {
@@ -263,6 +284,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
     z->in.pos = 0;
 
     for (i = 0; i < p->normal_num; i++) {
+        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
         z->out.dst = p->host + p->normal[i];
         z->out.size = p->page_size;
         z->out.pos = 0;
@@ -277,7 +299,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
          */
         do {
             ret = ZSTD_decompressStream(z->zds, &z->out, &z->in);
-        } while (ret > 0 && (z->in.size - z->in.pos > 0)
+        } while (ret > 0 && (z->in.size > z->in.pos)
                  && (z->out.pos < p->page_size));
         if (ret > 0 && (z->out.pos < p->page_size)) {
             error_setg(errp, "multifd %u: decompressStream buffer too small",
@@ -305,7 +327,7 @@ static MultiFDMethods multifd_zstd_ops = {
     .send_prepare = zstd_send_prepare,
     .recv_setup = zstd_recv_setup,
     .recv_cleanup = zstd_recv_cleanup,
-    .recv_pages = zstd_recv_pages
+    .recv = zstd_recv
 };
 
 static void multifd_zstd_register(void)
diff --git a/migration/multifd.c b/migration/multifd.c
index 409460684f29a4671f3da69fe5da20c02322de62..4c310deb61b080bf91a7eee6321bcfc1d2ac1c34 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -11,12 +11,14 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qemu/rcu.h"
 #include "exec/target_page.h"
 #include "sysemu/sysemu.h"
 #include "exec/ramblock.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qapi/qapi-events-migration.h"
 #include "ram.h"
 #include "migration.h"
 #include "migration-stats.h"
@@ -45,20 +48,79 @@ typedef struct {
     uint64_t unused2[4];    /* Reserved for future use */
 } __attribute__((packed)) MultiFDInit_t;
 
+struct {
+    MultiFDSendParams *params;
+    /* array of pages to send */
+    MultiFDPages_t *pages;
+    /*
+     * Global number of generated multifd packets.
+     *
+     * Note that we use 'uintptr_t' because it naturally supports atomic
+     * operations on both 32-bit and 64-bit hosts.  It means that on
+     * 32-bit systems multifd will overflow packet_num sooner, but that
+     * should be fine.
+     *
+     * Another option is to use QEMU's Stat64, which would be 64 bits on
+     * all hosts, but so far it does not support atomic fetch_add().
+     * Keep it simple for now.
+     */
+    uintptr_t packet_num;
+    /*
+     * Synchronization point past which no more channels will be
+     * created.
+     */
+    QemuSemaphore channels_created;
+    /* send channels ready */
+    QemuSemaphore channels_ready;
+    /*
+     * Have we already terminated the threads?  There is a race when we
+     * get an error while we are exiting.
+     * We will use atomic operations.  Only valid values are 0 and 1.
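The packet_num counter described above can be reproduced with C11 atomics; a minimal sketch of a uintptr_t-wide fetch-and-increment, with atomic_uintptr_t standing in for QEMU's qatomic helpers:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Pointer-width counter: lock-free fetch-and-increment on both
 * 32-bit and 64-bit hosts. */
static atomic_uintptr_t packet_num;

static uintptr_t next_packet_num(void)
{
    return atomic_fetch_add(&packet_num, 1);
}

int main(void)
{
    /* Prints 0 then 1; every caller gets a unique value even when
     * called from multiple threads concurrently. */
    printf("%lu %lu\n", (unsigned long)next_packet_num(),
           (unsigned long)next_packet_num());
    return 0;
}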
+ */ + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_send_state; + +struct { + MultiFDRecvParams *params; + /* number of created threads */ + int count; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + /* global number of generated multifd packets */ + uint64_t packet_num; + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_recv_state; + +static bool multifd_use_packets(void) +{ + return true; +} + /* Multifd without compression */ /** * nocomp_send_setup: setup send side * - * For no compression this function does nothing. - * - * Returns 0 for success or -1 for error - * * @p: Params for the channel that we are using * @errp: pointer to an error */ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) { + if (migrate_zero_copy_send()) { + p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } + + if (multifd_use_packets()) { + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, p->page_count + 1); + } else { + p->iov = g_new0(struct iovec, p->page_count); + } + return 0; } @@ -72,9 +134,24 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) */ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) { + g_free(p->iov); + p->iov = NULL; return; } +static void multifd_send_prepare_iovs(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + for (int i = 0; i < pages->normal_num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + + p->next_packet_size = pages->normal_num * p->page_size; +} + /** * nocomp_send_prepare: prepare date to be able to send * @@ -88,16 +165,38 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { - MultiFDPages_t *pages = p->pages; + bool use_zero_copy_send = migrate_zero_copy_send(); + int ret; - for (int i = 0; i < p->normal_num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; - p->iov[p->iovs_num].iov_len = p->page_size; - p->iovs_num++; + multifd_send_zero_page_detect(p); + + if (!multifd_use_packets()) { + multifd_send_prepare_iovs(p); + return 0; + } + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. + */ + multifd_send_prepare_header(p); } - p->next_packet_size = p->normal_num * p->page_size; + multifd_send_prepare_iovs(p); p->flags |= MULTIFD_FLAG_NOCOMP; + + multifd_send_fill_packet(p); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, errp); + if (ret != 0) { + return -1; + } + } + return 0; } @@ -113,6 +212,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) */ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) { + p->iov = g_new0(struct iovec, p->page_count); return 0; } @@ -125,10 +225,12 @@ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void nocomp_recv_cleanup(MultiFDRecvParams *p) { + g_free(p->iov); + p->iov = NULL; } /** - * nocomp_recv_pages: read the data from the channel into actual pages + * nocomp_recv: read the data from the channel * * For no compression we just need to read things into the correct place. 
* @@ -137,18 +239,32 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) +static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { - uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t flags; + + if (!multifd_use_packets()) { + return 0; + } + + flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; if (flags != MULTIFD_FLAG_NOCOMP) { error_setg(errp, "multifd %u: flags received %x flags expected %x", p->id, flags, MULTIFD_FLAG_NOCOMP); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + return 0; + } + for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); } @@ -159,7 +275,7 @@ static MultiFDMethods multifd_nocomp_ops = { .send_prepare = nocomp_send_prepare, .recv_setup = nocomp_recv_setup, .recv_cleanup = nocomp_recv_cleanup, - .recv_pages = nocomp_recv_pages + .recv = nocomp_recv }; static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { @@ -172,6 +288,18 @@ void multifd_register_ops(int method, MultiFDMethods *ops) multifd_ops[method] = ops; } +/* Reset a MultiFDPages_t* object for the next use */ +static void multifd_pages_reset(MultiFDPages_t *pages) +{ + /* + * We don't need to touch offset[] array, because it will be + * overwritten later when reused. + */ + pages->num = 0; + pages->normal_num = 0; + pages->block = NULL; +} + static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) { MultiFDInit_t msg = {}; @@ -228,56 +356,68 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) } if (msg.id > migrate_multifd_channels()) { - error_setg(errp, "multifd: received channel version %u " - "expected %u", msg.version, MULTIFD_VERSION); + error_setg(errp, "multifd: received channel id %u is greater than " + "number of channels %u", msg.id, migrate_multifd_channels()); return -1; } return msg.id; } -static MultiFDPages_t *multifd_pages_init(size_t size) +static MultiFDPages_t *multifd_pages_init(uint32_t n) { MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); - pages->allocated = size; - pages->offset = g_new0(ram_addr_t, size); + pages->allocated = n; + pages->offset = g_new0(ram_addr_t, n); return pages; } static void multifd_pages_clear(MultiFDPages_t *pages) { - pages->num = 0; + multifd_pages_reset(pages); pages->allocated = 0; - pages->packet_num = 0; - pages->block = NULL; g_free(pages->offset); pages->offset = NULL; g_free(pages); } -static void multifd_send_fill_packet(MultiFDSendParams *p) +void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; + uint64_t packet_num; + uint32_t zero_num = pages->num - pages->normal_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(p->normal_num); + packet->normal_pages = cpu_to_be32(pages->normal_num); + packet->zero_pages = cpu_to_be32(zero_num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); - packet->packet_num = cpu_to_be64(p->packet_num); - if (p->pages->block) { - strncpy(packet->ramblock, p->pages->block->idstr, 256); + packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); + packet->packet_num = cpu_to_be64(packet_num); + + if 
(pages->block) { + pstrcpy(packet->ramblock, sizeof(packet->ramblock), + pages->block->idstr); } - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { /* there are architectures where ram_addr_t is 32 bit */ - uint64_t temp = p->normal[i]; + uint64_t temp = pages->offset[i]; packet->offset[i] = cpu_to_be64(temp); } + + p->packets_sent++; + p->total_normal_pages += pages->normal_num; + p->total_zero_pages += zero_num; + + trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, + p->flags, p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -318,15 +458,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal_num = be32_to_cpu(packet->normal_pages); if (p->normal_num > packet->pages_alloc) { error_setg(errp, "multifd: received packet " - "with %u pages and expected maximum pages are %u", + "with %u normal pages and expected maximum pages are %u", p->normal_num, packet->pages_alloc) ; return -1; } + p->zero_num = be32_to_cpu(packet->zero_pages); + if (p->zero_num > packet->pages_alloc - p->normal_num) { + error_setg(errp, "multifd: received packet " + "with %u zero pages and expected maximum zero pages are %u", + p->zero_num, packet->pages_alloc - p->normal_num) ; + return -1; + } + p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); + p->packets_recved++; + p->total_normal_pages += p->normal_num; + p->total_zero_pages += p->zero_num; + + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, + p->flags, p->next_packet_size); - if (p->normal_num == 0) { + if (p->normal_num == 0 && p->zero_num == 0) { return 0; } @@ -352,26 +506,42 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal[i] = offset; } + for (i = 0; i < p->zero_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); + + if (offset > (p->block->used_length - p->page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->zero[i] = offset; + } + return 0; } -struct { - MultiFDSendParams *params; - /* array of pages to sent */ - MultiFDPages_t *pages; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* send channels ready */ - QemuSemaphore channels_ready; - /* - * Have we already run terminate threads. There is a race when it - * happens that we got one error while we are exiting. - * We will use atomic operations. Only valid values are 0 and 1. - */ - int exiting; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_send_state; +static bool multifd_send_should_exit(void) +{ + return qatomic_read(&multifd_send_state->exiting); +} + +static bool multifd_recv_should_exit(void) +{ + return qatomic_read(&multifd_recv_state->exiting); +} + +/* + * The migration thread can wait on either of the two semaphores. This + * function can be used to kick the main thread out of waiting on either of + * them. Should mostly only be called when something wrong happened with + * the current multifd send thread. + */ +static void multifd_send_kick_main(MultiFDSendParams *p) +{ + qemu_sem_post(&p->sem_sync); + qemu_sem_post(&multifd_send_state->channels_ready); +} /* * How we use multifd_send_state->pages and channel->pages? 
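The unfill checks above follow one rule: every count and offset taken from the wire is range-checked against local invariants before it is used. A standalone sketch of the same rule (validate_header() is a hypothetical helper, not a QEMU function; used_length is assumed to be at least one page):

#include <stdbool.h>
#include <stdint.h>

/* Validate counts and offsets from an untrusted packet header. */
static bool validate_header(uint32_t pages_alloc, uint32_t normal_num,
                            uint32_t zero_num, const uint64_t *offsets,
                            uint64_t used_length, uint64_t page_size)
{
    if (normal_num > pages_alloc ||
        zero_num > pages_alloc - normal_num) {
        return false;                   /* counts exceed the allocation */
    }
    for (uint32_t i = 0; i < normal_num + zero_num; i++) {
        if (offsets[i] > used_length - page_size) {
            return false;               /* page would fall outside the block */
        }
    }
    return true;
}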
@@ -389,20 +559,23 @@ struct {
  * thread is using the channel mutex when changing it, and the channel
  * have to had finish with its own, otherwise pending_job can't be
  * false.
+ *
+ * Returns true on success, false otherwise.
  */
-
-static int multifd_send_pages(QEMUFile *f)
+static bool multifd_send_pages(void)
 {
     int i;
     static int next_channel;
     MultiFDSendParams *p = NULL; /* make happy gcc */
     MultiFDPages_t *pages = multifd_send_state->pages;
 
-    if (qatomic_read(&multifd_send_state->exiting)) {
-        return -1;
+    if (multifd_send_should_exit()) {
+        return false;
     }
 
+    /* We wait here, until at least one channel is ready */
     qemu_sem_wait(&multifd_send_state->channels_ready);
+
     /*
      * next_channel can remain from a previous migration that was
      * using more channels, so ensure it doesn't overflow if the
@@ -410,69 +583,100 @@ static int multifd_send_pages(QEMUFile *f)
      */
     next_channel %= migrate_multifd_channels();
     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
-        p = &multifd_send_state->params[i];
-
-        qemu_mutex_lock(&p->mutex);
-        if (p->quit) {
-            error_report("%s: channel %d has already quit!", __func__, i);
-            qemu_mutex_unlock(&p->mutex);
-            return -1;
+        if (multifd_send_should_exit()) {
+            return false;
         }
-        if (!p->pending_job) {
-            p->pending_job++;
+        p = &multifd_send_state->params[i];
+        /*
+         * Lockless read of p->pending_job is safe, because only the
+         * multifd sender thread can clear it.
+         */
+        if (qatomic_read(&p->pending_job) == false) {
             next_channel = (i + 1) % migrate_multifd_channels();
             break;
         }
-        qemu_mutex_unlock(&p->mutex);
     }
 
-    assert(!p->pages->num);
-    assert(!p->pages->block);
-    p->packet_num = multifd_send_state->packet_num++;
+    /*
+     * Make sure we read p->pending_job before all the rest.  Pairs with
+     * qatomic_store_release() in multifd_send_thread().
+     */
+    smp_mb_acquire();
+    assert(!p->pages->num);
     multifd_send_state->pages = p->pages;
     p->pages = pages;
-    qemu_mutex_unlock(&p->mutex);
+    /*
+     * Making sure p->pages is setup before marking pending_job=true. Pairs
+     * with the qatomic_load_acquire() in multifd_send_thread().
+     */
+    qatomic_store_release(&p->pending_job, true);
     qemu_sem_post(&p->sem);
 
-    return 1;
+    return true;
 }
 
-int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
+static inline bool multifd_queue_empty(MultiFDPages_t *pages)
 {
-    MultiFDPages_t *pages = multifd_send_state->pages;
-    bool changed = false;
+    return pages->num == 0;
+}
 
-    if (!pages->block) {
-        pages->block = block;
-    }
+static inline bool multifd_queue_full(MultiFDPages_t *pages)
+{
+    return pages->num == pages->allocated;
+}
 
-    if (pages->block == block) {
-        pages->offset[pages->num] = offset;
-        pages->num++;
+static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset)
+{
+    pages->offset[pages->num++] = offset;
+}
 
-        if (pages->num < pages->allocated) {
-            return 1;
-        }
-    } else {
-        changed = true;
-    }
+/* Returns true if enqueue successful, false otherwise */
+bool multifd_queue_page(RAMBlock *block, ram_addr_t offset)
+{
+    MultiFDPages_t *pages;
 
-    if (multifd_send_pages(f) < 0) {
-        return -1;
+retry:
+    pages = multifd_send_state->pages;
+
+    /* If the queue is empty, we can already enqueue now */
+    if (multifd_queue_empty(pages)) {
+        pages->block = block;
+        multifd_enqueue(pages, offset);
+        return true;
     }
 
-    if (changed) {
-        return multifd_queue_page(f, block, offset);
+    /*
+     * Not empty; meanwhile we need a flush.  It can be because of either:
+     *
+     * (1) The page is not on the same ramblock as previous ones, or,
+     * (2) The queue is full.
+     *
+     * After flush, always retry.
+     */
+    if (pages->block != block || multifd_queue_full(pages)) {
+        if (!multifd_send_pages()) {
+            return false;
+        }
+        goto retry;
+    }
 
-    return 1;
+    /* Not empty, and we still have space, do it! */
+    multifd_enqueue(pages, offset);
+    return true;
 }
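The flush-and-retry contract of multifd_queue_page() is small enough to capture standalone. A sketch under the same rules, with a hypothetical Queue type and a stubbed flush_batch() standing in for multifd_send_pages():

#include <stdbool.h>
#include <stddef.h>

typedef struct {
    int block;              /* which RAM block the queued offsets belong to */
    size_t num, cap;        /* cap must be set by the owner before use */
    size_t offsets[64];
} Queue;

/* Stand-in: hand the batch to a sender thread, then reset the queue. */
static bool flush_batch(Queue *q)
{
    q->num = 0;
    return true;
}

/* Queue one page; flush first when the block changes or the queue is
 * full, then retry, mirroring the enqueue logic above. */
static bool queue_page(Queue *q, int block, size_t offset)
{
retry:
    if (q->num == 0) {                  /* empty: adopt the block, enqueue */
        q->block = block;
        q->offsets[q->num++] = offset;
        return true;
    }
    if (q->block != block || q->num == q->cap) {
        if (!flush_batch(q)) {
            return false;
        }
        goto retry;                     /* queue is empty again now */
    }
    q->offsets[q->num++] = offset;      /* same block, space left */
    return true;
}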
 
-static void multifd_send_terminate_threads(Error *err)
+/* Multifd send side hit an error; remember it and prepare to quit */
+static void multifd_send_set_error(Error *err)
 {
-    int i;
-
-    trace_multifd_send_terminate_threads(err != NULL);
+    /*
+     * We don't want to exit the threads twice.  Depending on where we
+     * get the error, or if there are two independent errors in two
+     * threads at the same time, we can end up calling this function
+     * twice.
+     */
+    if (qatomic_xchg(&multifd_send_state->exiting, 1)) {
+        return;
+    }
 
     if (err) {
         MigrationState *s = migrate_get_current();
@@ -485,86 +689,104 @@ static void multifd_send_terminate_threads(Error *err)
             MIGRATION_STATUS_FAILED);
         }
     }
+}
+
+static void multifd_send_terminate_threads(void)
+{
+    int i;
+
+    trace_multifd_send_terminate_threads();
 
     /*
-     * We don't want to exit each threads twice. Depending on where
-     * we get the error, or if there are two independent errors in two
-     * threads at the same time, we can end calling this function
-     * twice.
+     * Tell everyone we're quitting.  No xchg() needed here; we simply
+     * always set it.
      */
-    if (qatomic_xchg(&multifd_send_state->exiting, 1)) {
-        return;
-    }
+    qatomic_set(&multifd_send_state->exiting, 1);
 
+    /*
+     * Firstly, kick all threads out, no matter whether they are just
+     * idle or blocked in an IO system call.
+     */
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDSendParams *p = &multifd_send_state->params[i];
 
-        qemu_mutex_lock(&p->mutex);
-        p->quit = true;
         qemu_sem_post(&p->sem);
         if (p->c) {
             qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
         }
-        qemu_mutex_unlock(&p->mutex);
     }
+
+    /*
+     * Finally recycle all the threads.
+ */ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + if (p->tls_thread_created) { + qemu_thread_join(&p->tls_thread); + } + + if (p->thread_created) { + qemu_thread_join(&p->thread); + } + } +} + +static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) +{ + if (p->c) { + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + } + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + multifd_pages_clear(p->pages); + p->pages = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + multifd_send_state->ops->send_cleanup(p, errp); + + return *errp == NULL; } -static int multifd_send_channel_destroy(QIOChannel *send) +static void multifd_send_cleanup_state(void) { - return socket_send_channel_destroy(send); + socket_cleanup_outgoing_migration(); + qemu_sem_destroy(&multifd_send_state->channels_created); + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); + multifd_send_state->params = NULL; + multifd_pages_clear(multifd_send_state->pages); + multifd_send_state->pages = NULL; + g_free(multifd_send_state); + multifd_send_state = NULL; } -void multifd_save_cleanup(void) +void multifd_send_shutdown(void) { int i; if (!migrate_multifd()) { return; } - multifd_send_terminate_threads(NULL); - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDSendParams *p = &multifd_send_state->params[i]; - if (p->running) { - qemu_thread_join(&p->thread); - } - } + multifd_send_terminate_threads(); + for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; Error *local_err = NULL; - if (p->registered_yank) { - migration_ioc_unregister_yank(p->c); - } - multifd_send_channel_destroy(p->c); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - multifd_pages_clear(p->pages); - p->pages = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_send_state->ops->send_cleanup(p, &local_err); - if (local_err) { + if (!multifd_send_cleanup_channel(p, &local_err)) { migrate_set_error(migrate_get_current(), local_err); error_free(local_err); } } - qemu_sem_destroy(&multifd_send_state->channels_ready); - g_free(multifd_send_state->params); - multifd_send_state->params = NULL; - multifd_pages_clear(multifd_send_state->pages); - multifd_send_state->pages = NULL; - g_free(multifd_send_state); - multifd_send_state = NULL; + + multifd_send_cleanup_state(); } static int multifd_zero_copy_flush(QIOChannel *c) @@ -584,7 +806,7 @@ static int multifd_zero_copy_flush(QIOChannel *c) return ret; } -int multifd_send_sync_main(QEMUFile *f) +int multifd_send_sync_main(void) { int i; bool flush_zero_copy; @@ -593,47 +815,38 @@ int multifd_send_sync_main(QEMUFile *f) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages(f) < 0) { + if (!multifd_send_pages()) { error_report("%s: multifd_send_pages fail", __func__); return -1; } } - /* - * When using zero-copy, it's necessary to flush the pages before any of - * the pages can be sent again, so we'll make sure the new version of the - * pages will always arrive _later_ than the old pages. 
- * - * Currently we achieve this by flushing the zero-page requested writes - * per ram iteration, but in the future we could potentially optimize it - * to be less frequent, e.g. only after we finished one whole scanning of - * all the dirty bitmaps. - */ - flush_zero_copy = migrate_zero_copy_send(); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - trace_multifd_send_sync_main_signal(p->id); - - qemu_mutex_lock(&p->mutex); - - if (p->quit) { - error_report("%s: channel %d has already quit", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } - p->packet_num = multifd_send_state->packet_num++; - p->flags |= MULTIFD_FLAG_SYNC; - p->pending_job++; - qemu_mutex_unlock(&p->mutex); + trace_multifd_send_sync_main_signal(p->id); + + /* + * We should be the only user so far, so not possible to be set by + * others concurrently. + */ + assert(qatomic_read(&p->pending_sync) == false); + qatomic_set(&p->pending_sync, true); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (multifd_send_should_exit()) { + return -1; + } + qemu_sem_wait(&multifd_send_state->channels_ready); trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); @@ -653,75 +866,45 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; - bool use_zero_copy_send = migrate_zero_copy_send(); + bool use_packets = multifd_use_packets(); thread = migration_threads_add(p->name, qemu_get_thread_id()); + /* report multifd thread pid to libvirt */ + qapi_event_send_migration_multifd_pid(qemu_get_thread_id()); + trace_multifd_send_thread_start(p->id); rcu_register_thread(); - if (multifd_send_initial_packet(p, &local_err) < 0) { - ret = -1; - goto out; + if (use_packets) { + if (multifd_send_initial_packet(p, &local_err) < 0) { + ret = -1; + goto out; + } } - /* initial packet */ - p->num_packets = 1; while (true) { qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_wait(&p->sem); - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { break; } - qemu_mutex_lock(&p->mutex); - - if (p->pending_job) { - uint64_t packet_num = p->packet_num; - uint32_t flags; - p->normal_num = 0; - - if (use_zero_copy_send) { - p->iovs_num = 0; - } else { - p->iovs_num = 1; - } - for (int i = 0; i < p->pages->num; i++) { - p->normal[p->normal_num] = p->pages->offset[i]; - p->normal_num++; - } + /* + * Read pending_job flag before p->pages. Pairs with the + * qatomic_store_release() in multifd_send_pages(). 
+ */ + if (qatomic_load_acquire(&p->pending_job)) { + MultiFDPages_t *pages = p->pages; - if (p->normal_num) { - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - qemu_mutex_unlock(&p->mutex); - break; - } - } - multifd_send_fill_packet(p); - flags = p->flags; p->flags = 0; - p->num_packets++; - p->total_normal_pages += p->normal_num; - p->pages->num = 0; - p->pages->block = NULL; - qemu_mutex_unlock(&p->mutex); - - trace_multifd_send(p->id, packet_num, p->normal_num, flags, - p->next_packet_size); + p->iovs_num = 0; + assert(pages->num); - if (use_zero_copy_send) { - /* Send header first, without zerocopy */ - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; - } - } else { - /* Send header using the same writev call */ - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + break; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, @@ -731,18 +914,41 @@ static void *multifd_send_thread(void *opaque) } stat64_add(&mig_stats.multifd_bytes, - p->next_packet_size + p->packet_len); + (uint64_t)p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.normal_pages, pages->normal_num); + stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); + + multifd_pages_reset(p->pages); p->next_packet_size = 0; - qemu_mutex_lock(&p->mutex); - p->pending_job--; - qemu_mutex_unlock(&p->mutex); - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&p->sem_sync); - } + /* + * Making sure p->pages is published before saying "we're + * free". Pairs with the smp_mb_acquire() in + * multifd_send_pages(). + */ + qatomic_store_release(&p->pending_job, false); } else { - qemu_mutex_unlock(&p->mutex); - /* sometimes there are spurious wakeups */ + /* + * If not a normal job, must be a sync request. Note that + * pending_sync is a standalone flag (unlike pending_job), so + * it doesn't require explicit memory barriers. 
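The pending_job handshake relies on this release/acquire pairing rather than on locks. A self-contained C11 sketch of the same publication pattern, with a plain spin loop standing in for the semaphore wait:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int payload;                 /* stands in for p->pages */
static atomic_bool pending_job;

static void *worker(void *arg)
{
    (void)arg;
    /* The acquire load pairs with the release store in main(): once
     * pending_job reads true, the payload write is visible here too. */
    while (!atomic_load_explicit(&pending_job, memory_order_acquire)) {
        /* spin; the real code sleeps on a semaphore instead */
    }
    printf("worker got %d\n", payload);
    return NULL;
}

int main(void)
{
    pthread_t th;

    pthread_create(&th, NULL, worker, NULL);
    payload = 42;                                   /* publish data first */
    atomic_store_explicit(&pending_job, true,
                          memory_order_release);    /* then raise the flag */
    pthread_join(th, NULL);
    return 0;
}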
+ */ + assert(qatomic_read(&p->pending_sync)); + + if (use_packets) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + } + + qatomic_set(&p->pending_sync, false); + qemu_sem_post(&p->sem_sync); } } @@ -750,62 +956,37 @@ out: if (ret) { assert(local_err); trace_multifd_send_error(p->id); - multifd_send_terminate_threads(local_err); - qemu_sem_post(&p->sem_sync); - qemu_sem_post(&multifd_send_state->channels_ready); + multifd_send_set_error(local_err); + multifd_send_kick_main(p); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); - rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, + p->total_zero_pages); return NULL; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp); - -static void multifd_tls_outgoing_handshake(QIOTask *task, - gpointer opaque) -{ - MultiFDSendParams *p = opaque; - QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); - Error *err = NULL; - - if (!qio_task_propagate_error(task, &err)) { - trace_multifd_tls_outgoing_handshake_complete(ioc); - if (multifd_channel_connect(p, ioc, &err)) { - return; - } - } - - trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); +static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); - /* - * Error happen, mark multifd_send_thread status as 'quit' although it - * is not created, and then tell who pay attention to me. - */ - p->quit = true; - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); -} +typedef struct { + MultiFDSendParams *p; + QIOChannelTLS *tioc; +} MultiFDTLSThreadArgs; static void *multifd_tls_handshake_thread(void *opaque) { - MultiFDSendParams *p = opaque; - QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); + MultiFDTLSThreadArgs *args = opaque; - qio_channel_tls_handshake(tioc, - multifd_tls_outgoing_handshake, - p, + qio_channel_tls_handshake(args->tioc, + multifd_new_send_channel_async, + args->p, NULL, NULL); + g_free(args); + return NULL; } @@ -815,6 +996,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, { MigrationState *s = migrate_get_current(); const char *hostname = s->hostname; + MultiFDTLSThreadArgs *args; QIOChannelTLS *tioc; tioc = migration_tls_client_create(ioc, hostname, errp); @@ -822,76 +1004,91 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return false; } + /* + * Ownership of the socket channel now transfers to the newly + * created TLS channel, which has already taken a reference. 
+ */ object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); - p->c = QIO_CHANNEL(tioc); - qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", - multifd_tls_handshake_thread, p, + + args = g_new0(MultiFDTLSThreadArgs, 1); + args->tioc = tioc; + args->p = p; + + p->tls_thread_created = true; + qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", + multifd_tls_handshake_thread, args, QEMU_THREAD_JOINABLE); return true; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp) +static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) { - trace_multifd_set_outgoing_channel( - ioc, object_get_typename(OBJECT(ioc)), - migrate_get_current()->hostname); - - if (migrate_channel_requires_tls_upgrade(ioc)) { - /* - * tls_channel_connect will call back to this - * function after the TLS handshake, - * so we mustn't call multifd_send_thread until then - */ - return multifd_tls_channel_connect(p, ioc, errp); + qio_channel_set_delay(ioc, false); - } else { - migration_ioc_register_yank(ioc); - p->registered_yank = true; - p->c = ioc; - qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, - QEMU_THREAD_JOINABLE); - } - return true; -} + migration_ioc_register_yank(ioc); + /* Setup p->c only if the channel is completely setup */ + p->c = ioc; -static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, - QIOChannel *ioc, Error *err) -{ - migrate_set_error(migrate_get_current(), err); - /* Error happen, we need to tell who pay attention to me */ - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); - /* - * Although multifd_send_thread is not created, but main migration - * thread need to judge whether it is running, so we need to mark - * its status. - */ - p->quit = true; - object_unref(OBJECT(ioc)); - error_free(err); + p->thread_created = true; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); } +/* + * When TLS is enabled this function is called once to establish the + * TLS connection and a second time after the TLS handshake to create + * the multifd channel. Without TLS it goes straight into the channel + * creation. + */ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); Error *local_err = NULL; + bool ret; trace_multifd_new_send_channel_async(p->id); - if (!qio_task_propagate_error(task, &local_err)) { - qio_channel_set_delay(ioc, false); - p->running = true; - if (multifd_channel_connect(p, ioc, &local_err)) { + + if (qio_task_propagate_error(task, &local_err)) { + ret = false; + goto out; + } + + trace_multifd_set_outgoing_channel(ioc, object_get_typename(OBJECT(ioc)), + migrate_get_current()->hostname); + + if (migrate_channel_requires_tls_upgrade(ioc)) { + ret = multifd_tls_channel_connect(p, ioc, &local_err); + if (ret) { return; } + } else { + multifd_channel_connect(p, ioc); + ret = true; + } + +out: + /* + * Here we're not interested whether creation succeeded, only that + * it happened at all. 
+     */
+    qemu_sem_post(&multifd_send_state->channels_created);
+
+    if (ret) {
+        return;
+    }
 
     trace_multifd_new_send_channel_async_error(p->id, local_err);
-    multifd_new_send_channel_cleanup(p, ioc, local_err);
+    multifd_send_set_error(local_err);
+    /*
+     * For error cases (TLS or non-TLS), the IO channel is always freed
+     * here rather than in multifd cleanup: since p->c is not set, the
+     * multifd cleanup code doesn't even know it exists.
+     */
+    object_unref(OBJECT(ioc));
+    error_free(local_err);
 }
 
 static void multifd_new_send_channel_create(gpointer opaque)
@@ -899,20 +1096,24 @@ static void multifd_new_send_channel_create(gpointer opaque)
     socket_send_channel_create(multifd_new_send_channel_async, opaque);
 }
 
-int multifd_save_setup(Error **errp)
+bool multifd_send_setup(void)
 {
-    int thread_count;
+    MigrationState *s = migrate_get_current();
+    Error *local_err = NULL;
+    int thread_count, ret = 0;
     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
+    bool use_packets = multifd_use_packets();
     uint8_t i;
 
     if (!migrate_multifd()) {
-        return 0;
+        return true;
     }
 
     thread_count = migrate_multifd_channels();
     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
     multifd_send_state->pages = multifd_pages_init(page_count);
+    qemu_sem_init(&multifd_send_state->channels_created, 0);
     qemu_sem_init(&multifd_send_state->channels_ready, 0);
     qatomic_set(&multifd_send_state->exiting, 0);
     multifd_send_state->ops = multifd_ops[migrate_multifd_compression()];
@@ -920,59 +1121,53 @@ int multifd_save_setup(Error **errp)
     for (i = 0; i < thread_count; i++) {
         MultiFDSendParams *p = &multifd_send_state->params[i];
 
-        qemu_mutex_init(&p->mutex);
         qemu_sem_init(&p->sem, 0);
         qemu_sem_init(&p->sem_sync, 0);
-        p->quit = false;
-        p->pending_job = 0;
         p->id = i;
         p->pages = multifd_pages_init(page_count);
-        p->packet_len = sizeof(MultiFDPacket_t)
-                      + sizeof(uint64_t) * page_count;
-        p->packet = g_malloc0(p->packet_len);
-        p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
-        p->packet->version = cpu_to_be32(MULTIFD_VERSION);
+
+        if (use_packets) {
+            p->packet_len = sizeof(MultiFDPacket_t)
+                          + sizeof(uint64_t) * page_count;
+            p->packet = g_malloc0(p->packet_len);
+            p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
+            p->packet->version = cpu_to_be32(MULTIFD_VERSION);
+        }
         p->name = g_strdup_printf("multifdsend_%d", i);
-        /* We need one extra place for the packet header */
-        p->iov = g_new0(struct iovec, page_count + 1);
-        p->normal = g_new0(ram_addr_t, page_count);
         p->page_size = qemu_target_page_size();
         p->page_count = page_count;
-
-        if (migrate_zero_copy_send()) {
-            p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
-        } else {
-            p->write_flags = 0;
-        }
-
+        p->write_flags = 0;
         multifd_new_send_channel_create(p);
     }
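The channels_created accounting used below is a plain counting-semaphore rendezvous: every creation attempt posts exactly once, success or failure, and the main thread waits for that many posts. A standalone POSIX sketch with a hypothetical create_channel() worker:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NCHANNELS 4

static sem_t channels_created;

static void *create_channel(void *arg)
{
    (void)arg;
    /* ... channel setup here, which may succeed or fail ... */
    sem_post(&channels_created);   /* post in every case: the main thread
                                    * only needs to know creation was tried */
    return NULL;
}

int main(void)
{
    pthread_t th[NCHANNELS];

    sem_init(&channels_created, 0, 0);
    for (int i = 0; i < NCHANNELS; i++) {
        pthread_create(&th[i], NULL, create_channel, NULL);
    }
    /* Wait until creation has at least started everywhere; past this
     * point no new channels can appear, so later cleanup is race-free. */
    for (int i = 0; i < NCHANNELS; i++) {
        sem_wait(&channels_created);
    }
    for (int i = 0; i < NCHANNELS; i++) {
        pthread_join(th[i], NULL);
    }
    puts("all channel creations attempted");
    return 0;
}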
+ */ + for (i = 0; i < thread_count; i++) { + qemu_sem_wait(&multifd_send_state->channels_created); + } + for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - Error *local_err = NULL; - int ret; ret = multifd_send_state->ops->send_setup(p, &local_err); if (ret) { - error_propagate(errp, local_err); - return ret; + break; } } - return 0; -} -struct { - MultiFDRecvParams *params; - /* number of created threads */ - int count; - /* syncs main thread and channels */ - QemuSemaphore sem_sync; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_recv_state; + if (ret) { + migrate_set_error(s, local_err); + error_report_err(local_err); + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + return false; + } + + return true; +} static void multifd_recv_terminate_threads(Error *err) { @@ -980,6 +1175,10 @@ static void multifd_recv_terminate_threads(Error *err) trace_multifd_recv_terminate_threads(err != NULL); + if (qatomic_xchg(&multifd_recv_state->exiting, 1)) { + return; + } + if (err) { MigrationState *s = migrate_get_current(); migrate_set_error(s, err); @@ -993,8 +1192,14 @@ static void multifd_recv_terminate_threads(Error *err) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. + */ + if (multifd_use_packets()) { + qemu_sem_post(&p->sem_sync); + } + /* * We could arrive here for two reasons: * - normal quit, i.e. everything went fine, just finished @@ -1004,18 +1209,45 @@ static void multifd_recv_terminate_threads(Error *err) if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); } } -void multifd_load_shutdown(void) +void multifd_recv_shutdown(void) { if (migrate_multifd()) { multifd_recv_terminate_threads(NULL); } } -void multifd_load_cleanup(void) +static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) +{ + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->normal); + p->normal = NULL; + g_free(p->zero); + p->zero = NULL; + multifd_recv_state->ops->recv_cleanup(p); +} + +static void multifd_recv_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_recv_state->sem_sync); + g_free(multifd_recv_state->params); + multifd_recv_state->params = NULL; + g_free(multifd_recv_state); + multifd_recv_state = NULL; +} + +void multifd_recv_cleanup(void) { int i; @@ -1026,56 +1258,39 @@ void multifd_load_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - if (p->running) { - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. 
- */ - qemu_sem_post(&p->sem_sync); + if (p->thread_created) { + qemu_thread_join(&p->thread); } - - qemu_thread_join(&p->thread); } for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - - migration_ioc_unregister_yank(p->c); - object_unref(OBJECT(p->c)); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_recv_state->ops->recv_cleanup(p); + multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); } - qemu_sem_destroy(&multifd_recv_state->sem_sync); - g_free(multifd_recv_state->params); - multifd_recv_state->params = NULL; - g_free(multifd_recv_state); - multifd_recv_state = NULL; + multifd_recv_cleanup_state(); } void multifd_recv_sync_main(void) { + int thread_count = migrate_multifd_channels(); int i; - if (!migrate_multifd()) { + if (!migrate_multifd() || !multifd_use_packets()) { return; } - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - trace_multifd_recv_sync_main_wait(p->id); + /* + * Initiate the synchronization by waiting for all channels. + * For socket-based migration this means each channel has received + * the SYNC packet on the stream. + */ + for (i = 0; i < thread_count; i++) { + trace_multifd_recv_sync_main_wait(i); qemu_sem_wait(&multifd_recv_state->sem_sync); } - for (i = 0; i < migrate_multifd_channels(); i++) { + + /* + * Sync done. Release the channels for the next iteration. + */ + for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; WITH_QEMU_LOCK_GUARD(&p->mutex) { @@ -1093,50 +1308,54 @@ static void *multifd_recv_thread(void *opaque) { MultiFDRecvParams *p = opaque; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int ret; trace_multifd_recv_thread_start(p->id); rcu_register_thread(); while (true) { - uint32_t flags; + uint32_t flags = 0; + bool has_data = false; + p->normal_num = 0; - if (p->quit) { + if (multifd_recv_should_exit()) { break; } - ret = qio_channel_read_all_eof(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ - break; - } + if (use_packets) { + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ + break; + } - qemu_mutex_lock(&p->mutex); - ret = multifd_recv_unfill_packet(p, &local_err); - if (ret) { + qemu_mutex_lock(&p->mutex); + ret = multifd_recv_unfill_packet(p, &local_err); + if (ret) { + qemu_mutex_unlock(&p->mutex); + break; + } + + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = p->normal_num || p->zero_num; qemu_mutex_unlock(&p->mutex); - break; } - flags = p->flags; - /* recv methods don't know how to handle the SYNC flag */ - p->flags &= ~MULTIFD_FLAG_SYNC; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, - p->next_packet_size); - p->num_packets++; - p->total_normal_pages += p->normal_num; - qemu_mutex_unlock(&p->mutex); - - if (p->normal_num) { - ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (has_data) { + ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { break; } } - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&multifd_recv_state->sem_sync); - 
qemu_sem_wait(&p->sem_sync); + if (use_packets) { + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&multifd_recv_state->sem_sync); + qemu_sem_wait(&p->sem_sync); + } } } @@ -1144,20 +1363,20 @@ static void *multifd_recv_thread(void *opaque) multifd_recv_terminate_threads(local_err); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, + p->total_normal_pages, + p->total_zero_pages); return NULL; } -int multifd_load_setup(Error **errp) +int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; /* @@ -1172,6 +1391,7 @@ int multifd_load_setup(Error **errp) multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); qatomic_set(&multifd_recv_state->count, 0); + qatomic_set(&multifd_recv_state->exiting, 0); qemu_sem_init(&multifd_recv_state->sem_sync, 0); multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -1180,26 +1400,26 @@ int multifd_load_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; p->id = i; - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + } p->name = g_strdup_printf("multifdrecv_%d", i); - p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; p->page_size = qemu_target_page_size(); } for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - Error *local_err = NULL; int ret; - ret = multifd_recv_state->ops->recv_setup(p, &local_err); + ret = multifd_recv_state->ops->recv_setup(p, errp); if (ret) { - error_propagate(errp, local_err); return ret; } } @@ -1230,18 +1450,24 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) { MultiFDRecvParams *p; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int id; - id = multifd_recv_initial_packet(ioc, &local_err); - if (id < 0) { - multifd_recv_terminate_threads(local_err); - error_propagate_prepend(errp, local_err, - "failed to receive packet" - " via multifd channel %d: ", - qatomic_read(&multifd_recv_state->count)); - return; + if (use_packets) { + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + error_propagate_prepend(errp, local_err, + "failed to receive packet" + " via multifd channel %d: ", + qatomic_read(&multifd_recv_state->count)); + return; + } + trace_multifd_recv_new_channel(id); + } else { + /* next patch gives this a meaningful value */ + id = 0; } - trace_multifd_recv_new_channel(id); p = &multifd_recv_state->params[id]; if (p->c != NULL) { @@ -1253,11 +1479,22 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) } p->c = ioc; object_ref(OBJECT(ioc)); - /* initial packet */ - p->num_packets = 1; - p->running = true; + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); } + +bool 
multifd_send_prepare_common(MultiFDSendParams *p) +{ + multifd_send_prepare_header(p); + multifd_send_zero_page_detect(p); + + if (!p->pages->normal_num) { + p->next_packet_size = 0; + return false; + } + + return true; +} diff --git a/migration/multifd.h b/migration/multifd.h index a835643b48c85ccbfc20d705eb67af5b09323d17..57c13347881eb557f1dcb032a6674cb3ca0e8036 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,26 +13,31 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_save_setup(Error **errp); -void multifd_save_cleanup(void); -int multifd_load_setup(Error **errp); -void multifd_load_cleanup(void); -void multifd_load_shutdown(void); +#include "ram.h" + +bool multifd_send_setup(void); +void multifd_send_shutdown(void); +int multifd_recv_setup(Error **errp); +void multifd_recv_cleanup(void); +void multifd_recv_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -int multifd_send_sync_main(QEMUFile *f); -int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset); +int multifd_send_sync_main(void); +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) -/* We reserve 3 bits for compression methods */ -#define MULTIFD_FLAG_COMPRESSION_MASK (7 << 1) +/* We reserve 5 bits for compression methods */ +#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1) /* we need to be compatible. Before compression value was 0 */ #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) +#define MULTIFD_FLAG_QPL (4 << 1) +#define MULTIFD_FLAG_UADK (8 << 1) +#define MULTIFD_FLAG_QATZIP (16 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) @@ -48,18 +53,26 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; uint64_t packet_num; - uint64_t unused[4]; /* Reserved for future use */ + /* zero pages */ + uint32_t zero_pages; + uint32_t unused32[1]; /* Reserved for future use */ + uint64_t unused64[3]; /* Reserved for future use */ char ramblock[256]; + /* + * This array contains the pointers to: + * - normal pages (initial normal_pages entries) + * - zero pages (following zero_pages entries) + */ uint64_t offset[]; } __attribute__((packed)) MultiFDPacket_t; typedef struct { /* number of used pages */ uint32_t num; + /* number of normal pages */ + uint32_t normal_num; /* number of allocated pages */ uint32_t allocated; - /* global number of generated multifd packets */ - uint64_t packet_num; /* offset of each page */ ram_addr_t *offset; RAMBlock *block; @@ -75,10 +88,11 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; + QemuThread tls_thread; + bool tls_thread_created; /* communication channel */ QIOChannel *c; - /* is the yank function registered */ - bool registered_yank; /* packet allocated len */ uint32_t packet_len; /* guest page size */ @@ -93,18 +107,19 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* this mutex protects the following parameters */ - QemuMutex mutex; - /* is this channel thread running */ - bool running; - /* should this thread finish */ - bool quit; /* multifd flags for each packet */ uint32_t flags; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* thread has work to do */ - int 
pending_job; + /* + * The sender thread has work to do if either of the booleans below is set. + * + * @pending_job: a job is pending + * @pending_sync: a sync request is pending + * + * For both of these fields, they're only set by the requesters, and + * cleared by the multifd sender threads. + */ + bool pending_job; + bool pending_sync; /* array of pages to sent. * The owner of 'pages' depends of 'pending_job' value: * pending_job == 0 -> migration_thread can use it. @@ -119,19 +134,17 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ - uint64_t num_packets; + uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; + /* zero pages sent through this channel */ + uint64_t total_zero_pages; /* buffers to send */ struct iovec *iov; /* number of iovs used */ uint32_t iovs_num; - /* Pages that are not zero */ - ram_addr_t *normal; - /* num of non zero pages */ - uint32_t normal_num; /* used for compression methods */ - void *data; + void *compress_data; } MultiFDSendParams; typedef struct { @@ -144,6 +157,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; /* communication channel */ QIOChannel *c; /* packet allocated len */ @@ -158,8 +172,6 @@ typedef struct { /* this mutex protects the following parameters */ QemuMutex mutex; - /* is this channel thread running */ - bool running; /* should this thread finish */ bool quit; /* multifd flags for each packet */ @@ -173,22 +185,28 @@ typedef struct { MultiFDPacket_t *packet; /* size of the next packet that contains pages */ uint32_t next_packet_size; - /* packets sent through this channel */ - uint64_t num_packets; + /* packets received through this channel */ + uint64_t packets_recved; /* ramblock */ RAMBlock *block; /* ramblock host address */ uint8_t *host; /* non zero pages recv through this channel */ uint64_t total_normal_pages; + /* zero pages recv through this channel */ + uint64_t total_zero_pages; /* buffers to recv */ struct iovec *iov; /* Pages that are not zero */ ram_addr_t *normal; /* num of non zero pages */ uint32_t normal_num; + /* Pages that are zero */ + ram_addr_t *zero; + /* num of zero pages */ + uint32_t zero_num; /* used for de-compression methods */ - void *data; + void *compress_data; } MultiFDRecvParams; typedef struct { @@ -202,11 +220,22 @@ typedef struct { int (*recv_setup)(MultiFDRecvParams *p, Error **errp); /* Cleanup for receiving side */ void (*recv_cleanup)(MultiFDRecvParams *p); - /* Read all pages */ - int (*recv_pages)(MultiFDRecvParams *p, Error **errp); + /* Read all data */ + int (*recv)(MultiFDRecvParams *p, Error **errp); } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); +void multifd_send_fill_packet(MultiFDSendParams *p); +bool multifd_send_prepare_common(MultiFDSendParams *p); +void multifd_send_zero_page_detect(MultiFDSendParams *p); +void multifd_recv_zero_page_process(MultiFDRecvParams *p); -#endif +static inline void multifd_send_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} + +#endif diff --git a/migration/options.c b/migration/options.c index 8d8ec73ad95bd1dc7db9cc9881099524085544ae..6ba7ff65a379e5bed488df699d5e80c0b2072d55 100644 --- a/migration/options.c +++ b/migration/options.c @@ -47,6 +47,7 @@ #define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2 /*0: means nocompress, 1: best speed, ...
9: best compress ratio */ #define DEFAULT_MIGRATE_COMPRESS_LEVEL 1 +#define DEFAULT_MIGRATE_COMPRESS_METHOD COMPRESS_METHOD_ZLIB /* Define default autoconverge cpu throttle migration parameters */ #define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50 #define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20 @@ -62,6 +63,13 @@ #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 +/* + * 1: best speed, ... 9: best compress ratio + * There is some nuance here. Refer to QATzip documentation to understand + * the mapping of QATzip levels to standard deflate levels. + */ +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1 + /* 0: means nocompress, 1: best speed, ... 20: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 @@ -113,6 +121,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("x-decompress-threads", MigrationState, parameters.decompress_threads, DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT), + DEFINE_PROP_COMPRESS_METHOD("compress-method", MigrationState, + parameters.compress_method, + DEFAULT_MIGRATE_COMPRESS_METHOD), DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState, parameters.throttle_trigger_threshold, DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD), @@ -143,6 +154,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, parameters.multifd_zlib_level, DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), + DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState, + parameters.multifd_qatzip_level, + DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL), DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, parameters.multifd_zstd_level, DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), @@ -179,6 +193,15 @@ Property migration_properties[] = { DEFINE_PROP_MIG_MODE("mode", MigrationState, parameters.mode, MIG_MODE_NORMAL), + DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, + parameters.zero_page_detection, + ZERO_PAGE_DETECTION_LEGACY), + DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), + DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), + DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), + DEFINE_PROP_UINT8("hdbss-buffer-size", MigrationState, + parameters.hdbss_buffer_size, + DEFAULT_HDBSS_BUFFER_SIZE), /* Migration capabilities */ DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), @@ -795,6 +818,15 @@ int migrate_decompress_threads(void) return s->parameters.decompress_threads; } +CompressMethod migrate_compress_method(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->parameters.compress_method; +} + uint64_t migrate_downtime_limit(void) { MigrationState *s = migrate_get_current(); @@ -837,6 +869,13 @@ MigMode migrate_mode(void) return s->parameters.mode; } +int migrate_hdbss_buffer_size(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.hdbss_buffer_size; +} + int migrate_multifd_channels(void) { MigrationState *s = migrate_get_current(); @@ -859,6 +898,13 @@ int migrate_multifd_zlib_level(void) return s->parameters.multifd_zlib_level; } +int migrate_multifd_qatzip_level(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_qatzip_level; +} + int migrate_multifd_zstd_level(void) { MigrationState *s = migrate_get_current(); @@ -901,6 +947,13 @@ uint64_t migrate_xbzrle_cache_size(void) return s->parameters.xbzrle_cache_size; } +ZeroPageDetection 
migrate_zero_page_detection(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.zero_page_detection; +} + /* parameter setters */ void migrate_set_block_incremental(bool value) @@ -953,6 +1006,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->compress_wait_thread = s->parameters.compress_wait_thread; params->has_decompress_threads = true; params->decompress_threads = s->parameters.decompress_threads; + params->has_compress_method = true; + params->compress_method = s->parameters.compress_method; params->has_throttle_trigger_threshold = true; params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold; params->has_cpu_throttle_initial = true; @@ -981,6 +1036,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->multifd_compression = s->parameters.multifd_compression; params->has_multifd_zlib_level = true; params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_qatzip_level = true; + params->multifd_qatzip_level = s->parameters.multifd_qatzip_level; params->has_multifd_zstd_level = true; params->multifd_zstd_level = s->parameters.multifd_zstd_level; params->has_xbzrle_cache_size = true; @@ -997,6 +1054,9 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->announce_rounds = s->parameters.announce_rounds; params->has_announce_step = true; params->announce_step = s->parameters.announce_step; + params->sev_pdh = g_strdup(s->parameters.sev_pdh); + params->sev_plat_cert = g_strdup(s->parameters.sev_plat_cert); + params->sev_amd_cert = g_strdup(s->parameters.sev_amd_cert); if (s->parameters.has_block_bitmap_mapping) { params->has_block_bitmap_mapping = true; @@ -1011,6 +1071,10 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; params->has_mode = true; params->mode = s->parameters.mode; + params->has_zero_page_detection = true; + params->zero_page_detection = s->parameters.zero_page_detection; + params->has_hdbss_buffer_size = true; + params->hdbss_buffer_size = s->parameters.hdbss_buffer_size; return params; } @@ -1025,6 +1089,7 @@ void migrate_params_init(MigrationParameters *params) params->has_compress_threads = true; params->has_compress_wait_thread = true; params->has_decompress_threads = true; + params->has_compress_method = true; params->has_throttle_trigger_threshold = true; params->has_cpu_throttle_initial = true; params->has_cpu_throttle_increment = true; @@ -1036,6 +1101,7 @@ void migrate_params_init(MigrationParameters *params) params->has_multifd_channels = true; params->has_multifd_compression = true; params->has_multifd_zlib_level = true; + params->has_multifd_qatzip_level = true; params->has_multifd_zstd_level = true; params->has_xbzrle_cache_size = true; params->has_max_postcopy_bandwidth = true; @@ -1047,6 +1113,39 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + params->has_zero_page_detection = true; + params->has_hdbss_buffer_size = true; + + params->sev_pdh = g_strdup(""); + params->sev_plat_cert = g_strdup(""); + params->sev_amd_cert = g_strdup(""); +} + +static bool compress_level_check(MigrationParameters *params, Error **errp) +{ + switch (params->compress_method) { + case COMPRESS_METHOD_ZLIB: + if (params->compress_level > 9 || params->compress_level < 1) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", + "a value in 
the range of 1 to 9 for Zlib method"); + return false; + } + break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + if (params->compress_level > 19 || params->compress_level < 1) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", + "a value in the range of 1 to 19 for Zstd method"); + return false; + } + break; +#endif + default: + error_setg(errp, "compress_level check failed: unknown compression method"); + return false; + } + + return true; } /* @@ -1055,10 +1154,7 @@ void migrate_params_init(MigrationParameters *params) */ bool migrate_params_check(MigrationParameters *params, Error **errp) { - if (params->has_compress_level && - (params->compress_level > 9)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", - "a value between 0 and 9"); + if (params->has_compress_level && !compress_level_check(params, errp)) { return false; } @@ -1145,6 +1241,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } + if (params->has_multifd_qatzip_level && + ((params->multifd_qatzip_level > 9) || + (params->multifd_qatzip_level < 1))) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level", + "a value between 1 and 9"); + return false; + } + if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", @@ -1259,6 +1363,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->decompress_threads = params->decompress_threads; } + if (params->has_compress_method) { + dest->compress_method = params->compress_method; + } + if (params->has_throttle_trigger_threshold) { dest->throttle_trigger_threshold = params->throttle_trigger_threshold; } @@ -1310,6 +1418,15 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + dest->multifd_qatzip_level = params->multifd_qatzip_level; + } + if (params->has_multifd_zlib_level) { + dest->multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + dest->multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { dest->xbzrle_cache_size = params->xbzrle_cache_size; } @@ -1348,6 +1465,27 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_mode) { dest->mode = params->mode; } + + if (params->has_zero_page_detection) { + dest->zero_page_detection = params->zero_page_detection; + } + + if (params->sev_pdh) { + assert(params->sev_pdh->type == QTYPE_QSTRING); + dest->sev_pdh = params->sev_pdh->u.s; + } + if (params->sev_plat_cert) { + assert(params->sev_plat_cert->type == QTYPE_QSTRING); + dest->sev_plat_cert = params->sev_plat_cert->u.s; + } + if (params->sev_amd_cert) { + assert(params->sev_amd_cert->type == QTYPE_QSTRING); + dest->sev_amd_cert = params->sev_amd_cert->u.s; + } + + if (params->has_hdbss_buffer_size) { + dest->hdbss_buffer_size = params->hdbss_buffer_size; + } } static void migrate_params_apply(MigrateSetParameters *params, Error **errp) @@ -1380,6 +1518,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.decompress_threads = params->decompress_threads; } + if (params->has_compress_method) { + s->parameters.compress_method = params->compress_method; + } + if (params->has_throttle_trigger_threshold) { s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold; }
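The dispatch in compress_level_check() above exists because each compressor accepts a different level range (1..9 for zlib, 1..19 for zstd; the multifd QATzip level is bounded to 1..9 separately). A minimal standalone sketch of that per-method range check, with illustrative names rather than the patch's actual types:

```c
/*
 * Minimal sketch (not QEMU code): validate a compression level against
 * the range of the selected method, mirroring the zlib (1..9) and
 * zstd (1..19) bounds enforced by compress_level_check() above.
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum { METHOD_ZLIB, METHOD_ZSTD } Method;

static bool level_in_range(Method m, int level)
{
    switch (m) {
    case METHOD_ZLIB:
        return level >= 1 && level <= 9;
    case METHOD_ZSTD:
        return level >= 1 && level <= 19;
    default:
        return false; /* unknown method: reject, as the patch does */
    }
}

int main(void)
{
    printf("%d\n", level_in_range(METHOD_ZSTD, 15)); /* 1: valid for zstd */
    printf("%d\n", level_in_range(METHOD_ZLIB, 15)); /* 0: zlib tops out at 9 */
    return 0;
}
```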
@@ -1445,6 +1587,15 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + s->parameters.multifd_qatzip_level = params->multifd_qatzip_level; + } + if (params->has_multifd_zlib_level) { + s->parameters.multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + s->parameters.multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; xbzrle_cache_resize(params->xbzrle_cache_size, errp); @@ -1492,6 +1643,30 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_mode) { s->parameters.mode = params->mode; } + + if (params->has_zero_page_detection) { + s->parameters.zero_page_detection = params->zero_page_detection; + } + + if (params->sev_pdh) { + g_free(s->parameters.sev_pdh); + assert(params->sev_pdh->type == QTYPE_QSTRING); + s->parameters.sev_pdh = g_strdup(params->sev_pdh->u.s); + } + if (params->sev_plat_cert) { + g_free(s->parameters.sev_plat_cert); + assert(params->sev_plat_cert->type == QTYPE_QSTRING); + s->parameters.sev_plat_cert = g_strdup(params->sev_plat_cert->u.s); + } + if (params->sev_amd_cert) { + g_free(s->parameters.sev_amd_cert); + assert(params->sev_amd_cert->type == QTYPE_QSTRING); + s->parameters.sev_amd_cert = g_strdup(params->sev_amd_cert->u.s); + } + + if (params->has_hdbss_buffer_size) { + s->parameters.hdbss_buffer_size = params->hdbss_buffer_size; + } } void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) @@ -1517,6 +1692,27 @@ void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp) params->tls_authz->type = QTYPE_QSTRING; params->tls_authz->u.s = strdup(""); } + /* TODO Rewrite "" to null instead */ + if (params->sev_pdh + && params->sev_pdh->type == QTYPE_QNULL) { + qobject_unref(params->sev_pdh->u.n); + params->sev_pdh->type = QTYPE_QSTRING; + params->sev_pdh->u.s = strdup(""); + } + /* TODO Rewrite "" to null instead */ + if (params->sev_plat_cert + && params->sev_plat_cert->type == QTYPE_QNULL) { + qobject_unref(params->sev_plat_cert->u.n); + params->sev_plat_cert->type = QTYPE_QSTRING; + params->sev_plat_cert->u.s = strdup(""); + } + /* TODO Rewrite "" to null instead */ + if (params->sev_amd_cert + && params->sev_amd_cert->type == QTYPE_QNULL) { + qobject_unref(params->sev_amd_cert->u.n); + params->sev_amd_cert->type = QTYPE_QSTRING; + params->sev_amd_cert->u.s = strdup(""); + } migrate_params_test_apply(params, &tmp); diff --git a/migration/options.h b/migration/options.h index 246c160aeeea4c75f104e66a2008004d0ffd7693..6b2a893217f0458018d70a17d7655c86f396a03e 100644 --- a/migration/options.h +++ b/migration/options.h @@ -78,21 +78,25 @@ uint8_t migrate_cpu_throttle_increment(void); uint8_t migrate_cpu_throttle_initial(void); bool migrate_cpu_throttle_tailslow(void); int migrate_decompress_threads(void); +CompressMethod migrate_compress_method(void); uint64_t migrate_downtime_limit(void); uint8_t migrate_max_cpu_throttle(void); uint64_t migrate_max_bandwidth(void); uint64_t migrate_avail_switchover_bandwidth(void); uint64_t migrate_max_postcopy_bandwidth(void); MigMode migrate_mode(void); +int migrate_hdbss_buffer_size(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); +int 
migrate_multifd_qatzip_level(void); int migrate_multifd_zstd_level(void); uint8_t migrate_throttle_trigger_threshold(void); const char *migrate_tls_authz(void); const char *migrate_tls_creds(void); const char *migrate_tls_hostname(void); uint64_t migrate_xbzrle_cache_size(void); +ZeroPageDetection migrate_zero_page_detection(void); /* parameters setters */ diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 94231ff2955c80b3d0fab11a40510d34c334a826..bd1dbc3db175ff75e08ef65fac7978ac802dafb7 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -669,55 +669,6 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* return the size after compression, or negative value on error */ -static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len, - const uint8_t *source, size_t source_len) -{ - int err; - - err = deflateReset(stream); - if (err != Z_OK) { - return -1; - } - - stream->avail_in = source_len; - stream->next_in = (uint8_t *)source; - stream->avail_out = dest_len; - stream->next_out = dest; - - err = deflate(stream, Z_FINISH); - if (err != Z_STREAM_END) { - return -1; - } - - return stream->next_out - dest; -} - -/* Compress size bytes of data start at p and store the compressed - * data to the buffer of f. - * - * Since the file is dummy file with empty_ops, return -1 if f has no space to - * save the compressed data. - */ -ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, - const uint8_t *p, size_t size) -{ - ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); - - if (blen < compressBound(size)) { - return -1; - } - - blen = qemu_compress_data(stream, f->buf + f->buf_index + sizeof(int32_t), - blen, p, size); - if (blen < 0) { - return -1; - } - - qemu_put_be32(f, blen); - add_buf_to_iovec(f, blen); - return blen + sizeof(int32_t); -} /* Put the data in the buffer of f_src to the buffer of f_des, and * then reset the buf_index of f_src to 0. 
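The hunk above removes the monolithic qemu_compress_data()/qemu_put_compression_data() pair; the split qemu_put_compress_start()/qemu_put_compress_end() API added in the next hunk instead hands the caller a pointer into the QEMUFile buffer just past a reserved 4-byte length slot, and the caller commits the big-endian length once compression has finished. A standalone sketch of that reserve-then-commit pattern using plain zlib (buffer handling here is illustrative, not the patch's code):

```c
/*
 * Standalone sketch of the reserve-then-commit pattern that the split
 * API enables: leave a 4-byte slot for the big-endian length, compress
 * straight into the buffer, then fill the slot in afterwards.
 */
#include <arpa/inet.h>   /* htonl */
#include <stdint.h>
#include <string.h>
#include <zlib.h>

/* Returns total bytes written to out (4-byte header + payload), or -1. */
static long put_compressed(uint8_t *out, size_t out_cap,
                           const uint8_t *page, size_t page_len)
{
    uint8_t *dest = out + sizeof(uint32_t);          /* skip the length slot */
    uLongf clen = out_cap - sizeof(uint32_t);

    if (clen < compressBound(page_len)) {
        return -1;                                   /* not enough room */
    }
    if (compress2(dest, &clen, page, page_len, 1) != Z_OK) {
        return -1;
    }
    uint32_t be_len = htonl((uint32_t)clen);         /* commit the header */
    memcpy(out, &be_len, sizeof(be_len));
    return (long)(clen + sizeof(uint32_t));
}
```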
@@ -834,3 +785,15 @@ int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size) return 0; } + +ssize_t qemu_put_compress_start(QEMUFile *f, uint8_t **dest_ptr) +{ + *dest_ptr = f->buf + f->buf_index + sizeof(int32_t); + return IO_BUF_SIZE - f->buf_index - sizeof(int32_t); +} + +void qemu_put_compress_end(QEMUFile *f, unsigned int v) +{ + qemu_put_be32(f, v); + add_buf_to_iovec(f, v); +} diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 8aec9fabf7f15bc05ef8bc31eef8928ec36827b8..8afa95732bc8b4ba59792c599c699d95585c9448 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -54,8 +54,8 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size, size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset); size_t coroutine_mixed_fn qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size); -ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream, - const uint8_t *p, size_t size); +ssize_t qemu_put_compress_start(QEMUFile *f, uint8_t **dest_ptr); +void qemu_put_compress_end(QEMUFile *f, unsigned int v); int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src); bool qemu_file_buffer_empty(QEMUFile *file); diff --git a/migration/ram-compress.c b/migration/ram-compress.c index fa4388f6a61cc420bd2450038d7410a6a4e5694a..74703f0ec4593f1cd60b8d497f6e63383f68e0ee 100644 --- a/migration/ram-compress.c +++ b/migration/ram-compress.c @@ -28,7 +28,6 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" - #include "ram-compress.h" #include "qemu/error-report.h" @@ -40,6 +39,7 @@ #include "exec/ramblock.h" #include "ram.h" #include "migration-stats.h" +#include "exec/ram_addr.h" static struct { int64_t pages; @@ -65,46 +65,293 @@ static QemuThread *compress_threads; static QemuMutex comp_done_lock; static QemuCond comp_done_cond; -struct DecompressParam { - bool done; - bool quit; - QemuMutex mutex; - QemuCond cond; - void *des; - uint8_t *compbuf; - int len; - z_stream stream; -}; -typedef struct DecompressParam DecompressParam; - static QEMUFile *decomp_file; static DecompressParam *decomp_param; static QemuThread *decompress_threads; +MigrationCompressOps *compress_ops; +MigrationDecompressOps *decompress_ops; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; -static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, - RAMBlock *block, ram_addr_t offset, - uint8_t *source_buf); +static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block); + +static int zlib_save_setup(CompressParam *param) +{ + if (deflateInit(&param->stream, + migrate_compress_level()) != Z_OK) { + return -1; + } + + return 0; +} + +static ssize_t zlib_compress_data(CompressParam *param, size_t size) +{ + int err; + uint8_t *dest = NULL; + z_stream *stream = &param->stream; + uint8_t *p = param->originbuf; + QEMUFile *f = param->file; + ssize_t blen = qemu_put_compress_start(f, &dest); + + if (blen < compressBound(size)) { + return -1; + } + + err = deflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = size; + stream->next_in = p; + stream->avail_out = blen; + stream->next_out = dest; + + err = deflate(stream, Z_FINISH); + if (err != Z_STREAM_END) { + return -1; + } + + blen = stream->next_out - dest; + if (blen < 0) { + return -1; + } + + qemu_put_compress_end(f, blen); + return blen + sizeof(int32_t); +} + +static void zlib_save_cleanup(CompressParam *param) +{ + deflateEnd(&param->stream); +} + +static int zlib_load_setup(DecompressParam *param) +{
+ if (inflateInit(&param->stream) != Z_OK) { + return -1; + } + + return 0; +} + +static int +zlib_decompress_data(DecompressParam *param, uint8_t *dest, size_t size) +{ + int err; + + z_stream *stream = &param->stream; + + err = inflateReset(stream); + if (err != Z_OK) { + return -1; + } + + stream->avail_in = param->len; + stream->next_in = param->compbuf; + stream->avail_out = size; + stream->next_out = dest; + + err = inflate(stream, Z_NO_FLUSH); + if (err != Z_STREAM_END) { + return -1; + } + + return stream->total_out; +} + +static void zlib_load_cleanup(DecompressParam *param) +{ + inflateEnd(&param->stream); +} + +static int zlib_check_len(int len) +{ + return len < 0 || len > compressBound(TARGET_PAGE_SIZE); +} + +#ifdef CONFIG_ZSTD +static int zstd_save_setup(CompressParam *param) +{ + int res; + param->zstd_cs = ZSTD_createCStream(); + if (!param->zstd_cs) { + return -1; + } + res = ZSTD_initCStream(param->zstd_cs, migrate_compress_level()); + if (ZSTD_isError(res)) { + return -1; + } + return 0; +} +static void zstd_save_cleanup(CompressParam *param) +{ + ZSTD_freeCStream(param->zstd_cs); + param->zstd_cs = NULL; +} +static ssize_t zstd_compress_data(CompressParam *param, size_t size) +{ + int ret; + uint8_t *dest = NULL; + uint8_t *p = param->originbuf; + QEMUFile *f = param->file; + ssize_t blen = qemu_put_compress_start(f, &dest); + if (blen < ZSTD_compressBound(size)) { + return -1; + } + param->out.dst = dest; + param->out.size = blen; + param->out.pos = 0; + param->in.src = p; + param->in.size = size; + param->in.pos = 0; + do { + ret = ZSTD_compressStream2(param->zstd_cs, &param->out, + &param->in, ZSTD_e_end); + } while (ret > 0 && (param->in.size - param->in.pos > 0) + && (param->out.size - param->out.pos > 0)); + if (ret > 0 && (param->in.size - param->in.pos > 0)) { + return -1; + } + if (ZSTD_isError(ret)) { + return -1; + } + blen = param->out.pos; + qemu_put_compress_end(f, blen); + return blen + sizeof(int32_t); +} + +static int zstd_load_setup(DecompressParam *param) +{ + int ret; + param->zstd_ds = ZSTD_createDStream(); + if (!param->zstd_ds) { + return -1; + } + ret = ZSTD_initDStream(param->zstd_ds); + if (ZSTD_isError(ret)) { + return -1; + } + return 0; +} +static void zstd_load_cleanup(DecompressParam *param) +{ + ZSTD_freeDStream(param->zstd_ds); + param->zstd_ds = NULL; +} +static int +zstd_decompress_data(DecompressParam *param, uint8_t *dest, size_t size) +{ + int ret; + param->out.dst = dest; + param->out.size = size; + param->out.pos = 0; + param->in.src = param->compbuf; + param->in.size = param->len; + param->in.pos = 0; + do { + ret = ZSTD_decompressStream(param->zstd_ds, &param->out, &param->in); + } while (ret > 0 && (param->in.size - param->in.pos > 0) + && (param->out.size - param->out.pos > 0)); + if (ret > 0 && (param->in.size - param->in.pos > 0)) { + return -1; + } + if (ZSTD_isError(ret)) { + return -1; + } + return ret; +} +static int zstd_check_len(int len) +{ + return len < 0 || len > ZSTD_compressBound(TARGET_PAGE_SIZE); +} +#endif + +static int set_compress_ops(void) +{ + compress_ops = g_new0(MigrationCompressOps, 1); + + switch (migrate_compress_method()) { + case COMPRESS_METHOD_ZLIB: + compress_ops->save_setup = zlib_save_setup; + compress_ops->save_cleanup = zlib_save_cleanup; + compress_ops->compress_data = zlib_compress_data; + break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + compress_ops->save_setup = zstd_save_setup; + compress_ops->save_cleanup = zstd_save_cleanup; + compress_ops->compress_data = zstd_compress_data; + break; +#endif + default:
+ return -1; + } + + return 0; +} + +static int set_decompress_ops(void) +{ + decompress_ops = g_new0(MigrationDecompressOps, 1); + + switch (migrate_compress_method()) { + case COMPRESS_METHOD_ZLIB: + decompress_ops->load_setup = zlib_load_setup; + decompress_ops->load_cleanup = zlib_load_cleanup; + decompress_ops->decompress_data = zlib_decompress_data; + decompress_ops->check_len = zlib_check_len; + break; +#ifdef CONFIG_ZSTD + case COMPRESS_METHOD_ZSTD: + decompress_ops->load_setup = zstd_load_setup; + decompress_ops->load_cleanup = zstd_load_cleanup; + decompress_ops->decompress_data = zstd_decompress_data; + decompress_ops->check_len = zstd_check_len; + break; +#endif + default: + return -1; + } + + return 0; +} + +static void clean_compress_ops(void) +{ + compress_ops->save_setup = NULL; + compress_ops->save_cleanup = NULL; + compress_ops->compress_data = NULL; + + g_free(compress_ops); + compress_ops = NULL; +} + +static void clean_decompress_ops(void) +{ + decompress_ops->load_setup = NULL; + decompress_ops->load_cleanup = NULL; + decompress_ops->decompress_data = NULL; + + g_free(decompress_ops); + decompress_ops = NULL; +} static void *do_data_compress(void *opaque) { CompressParam *param = opaque; RAMBlock *block; - ram_addr_t offset; CompressResult result; qemu_mutex_lock(&param->mutex); while (!param->quit) { if (param->trigger) { block = param->block; - offset = param->offset; param->trigger = false; qemu_mutex_unlock(&param->mutex); - result = do_compress_ram_page(param->file, &param->stream, - block, offset, param->originbuf); - + result = do_compress_ram_page(param, block); qemu_mutex_lock(&comp_done_lock); param->done = true; param->result = result; @@ -147,7 +394,7 @@ void compress_threads_save_cleanup(void) qemu_thread_join(compress_threads + i); qemu_mutex_destroy(&comp_param[i].mutex); qemu_cond_destroy(&comp_param[i].cond); - deflateEnd(&comp_param[i].stream); + compress_ops->save_cleanup(&comp_param[i]); g_free(comp_param[i].originbuf); qemu_fclose(comp_param[i].file); comp_param[i].file = NULL; @@ -158,6 +405,7 @@ void compress_threads_save_cleanup(void) g_free(comp_param); compress_threads = NULL; comp_param = NULL; + clean_compress_ops(); } int compress_threads_save_setup(void) @@ -167,6 +415,12 @@ int compress_threads_save_setup(void) if (!migrate_compress()) { return 0; } + + if (set_compress_ops() < 0) { + clean_compress_ops(); + return -1; + } + thread_count = migrate_compress_threads(); compress_threads = g_new0(QemuThread, thread_count); comp_param = g_new0(CompressParam, thread_count); @@ -178,8 +432,7 @@ int compress_threads_save_setup(void) goto exit; } - if (deflateInit(&comp_param[i].stream, - migrate_compress_level()) != Z_OK) { + if (compress_ops->save_setup(&comp_param[i]) < 0) { g_free(comp_param[i].originbuf); goto exit; } @@ -204,15 +457,13 @@ exit: return -1; } -static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, - RAMBlock *block, ram_addr_t offset, - uint8_t *source_buf) +static CompressResult do_compress_ram_page(CompressParam *param, RAMBlock *block) { - uint8_t *p = block->host + offset; + uint8_t *p = block->host + (param->offset & TARGET_PAGE_MASK); size_t page_size = qemu_target_page_size(); int ret; - assert(qemu_file_buffer_empty(f)); + assert(qemu_file_buffer_empty(param->file)); if (buffer_is_zero(p, page_size)) { return RES_ZEROPAGE; @@ -223,12 +474,12 @@ static CompressResult do_compress_ram_page(QEMUFile *f, z_stream *stream, * so that we can catch up the error during compression and * decompression */
- memcpy(source_buf, p, page_size); - ret = qemu_put_compression_data(f, stream, source_buf, page_size); + memcpy(param->originbuf, p, page_size); + ret = compress_ops->compress_data(param, page_size); if (ret < 0) { qemu_file_set_error(migrate_get_current()->to_dst_file, ret); error_report("compressed data failed!"); - qemu_fflush(f); + qemu_fflush(param->file); return RES_NONE; } return RES_COMPRESS; @@ -320,50 +571,23 @@ bool compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset, } } -/* return the size after decompression, or negative value on error */ -static int -qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len, - const uint8_t *source, size_t source_len) -{ - int err; - - err = inflateReset(stream); - if (err != Z_OK) { - return -1; - } - - stream->avail_in = source_len; - stream->next_in = (uint8_t *)source; - stream->avail_out = dest_len; - stream->next_out = dest; - - err = inflate(stream, Z_NO_FLUSH); - if (err != Z_STREAM_END) { - return -1; - } - - return stream->total_out; -} - static void *do_data_decompress(void *opaque) { DecompressParam *param = opaque; unsigned long pagesize; uint8_t *des; - int len, ret; + int ret; qemu_mutex_lock(&param->mutex); while (!param->quit) { if (param->des) { des = param->des; - len = param->len; param->des = 0; qemu_mutex_unlock(&param->mutex); pagesize = qemu_target_page_size(); - ret = qemu_uncompress_data(&param->stream, des, pagesize, - param->compbuf, len); + ret = decompress_ops->decompress_data(param, des, pagesize); if (ret < 0 && migrate_get_current()->decompress_error_check) { error_report("decompress data failed"); qemu_file_set_error(decomp_file, ret); @@ -431,7 +655,7 @@ void compress_threads_load_cleanup(void) qemu_thread_join(decompress_threads + i); qemu_mutex_destroy(&decomp_param[i].mutex); qemu_cond_destroy(&decomp_param[i].cond); - inflateEnd(&decomp_param[i].stream); + decompress_ops->load_cleanup(&decomp_param[i]); g_free(decomp_param[i].compbuf); decomp_param[i].compbuf = NULL; } @@ -440,6 +664,7 @@ void compress_threads_load_cleanup(void) decompress_threads = NULL; decomp_param = NULL; decomp_file = NULL; + clean_decompress_ops(); } int compress_threads_load_setup(QEMUFile *f) @@ -450,6 +675,11 @@ int compress_threads_load_setup(QEMUFile *f) return 0; } + if (set_decompress_ops() < 0) { + clean_decompress_ops(); + return -1; + } + /* * set compression_counters memory to zero for a new migration */ @@ -462,7 +692,7 @@ int compress_threads_load_setup(QEMUFile *f) qemu_cond_init(&decomp_done_cond); decomp_file = f; for (i = 0; i < thread_count; i++) { - if (inflateInit(&decomp_param[i].stream) != Z_OK) { + if (decompress_ops->load_setup(&decomp_param[i]) < 0) { goto exit; } diff --git a/migration/ram-compress.h b/migration/ram-compress.h index 0d89a2f55e763e3a4eedb2724600154a016b37e7..e8700eb36faa6eb020c9a2f212ad3549da3881a7 100644 --- a/migration/ram-compress.h +++ b/migration/ram-compress.h @@ -29,6 +29,10 @@ #ifndef QEMU_MIGRATION_COMPRESS_H #define QEMU_MIGRATION_COMPRESS_H +#ifdef CONFIG_ZSTD +#include +#include +#endif #include "qemu-file.h" #include "qapi/qapi-types-migration.h" @@ -39,6 +43,25 @@ enum CompressResult { }; typedef enum CompressResult CompressResult; +struct DecompressParam { + bool done; + bool quit; + QemuMutex mutex; + QemuCond cond; + void *des; + uint8_t *compbuf; + int len; + + /* for zlib compression */ + z_stream stream; +#ifdef CONFIG_ZSTD + ZSTD_DStream *zstd_ds; + ZSTD_inBuffer in; + ZSTD_outBuffer out; +#endif +}; +typedef struct DecompressParam DecompressParam; +
struct CompressParam { bool done; bool quit; @@ -51,11 +74,32 @@ struct CompressParam { ram_addr_t offset; /* internally used fields */ - z_stream stream; uint8_t *originbuf; + + /* for zlib compression */ + z_stream stream; + +#ifdef CONFIG_ZSTD + ZSTD_CStream *zstd_cs; + ZSTD_inBuffer in; + ZSTD_outBuffer out; +#endif }; typedef struct CompressParam CompressParam; +typedef struct { + int (*save_setup)(CompressParam *param); + void (*save_cleanup)(CompressParam *param); + ssize_t (*compress_data)(CompressParam *param, size_t size); +} MigrationCompressOps; + +typedef struct { + int (*load_setup)(DecompressParam *param); + void (*load_cleanup)(DecompressParam *param); + int (*decompress_data)(DecompressParam *param, uint8_t *dest, size_t size); + int (*check_len)(int len); +} MigrationDecompressOps; + void compress_threads_save_cleanup(void); int compress_threads_save_setup(void); diff --git a/migration/ram.c b/migration/ram.c index 8c7886ab797b8a91d9ecf060e8ca016bd436ae46..91bec89a6e80d74712594809a1cbe9dada709c89 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -39,6 +39,7 @@ #include "migration-stats.h" #include "migration/register.h" #include "migration/misc.h" +#include "migration/options.h" #include "qemu-file.h" #include "postcopy-ram.h" #include "page_cache.h" @@ -63,6 +64,12 @@ #include "options.h" #include "sysemu/dirtylimit.h" #include "sysemu/kvm.h" +#include "exec/confidential-guest-support.h" + +/* Defines RAM_SAVE_ENCRYPTED_PAGE and RAM_SAVE_SHARED_REGION_LIST */ +#include "target/i386/sev.h" +#include "target/i386/csv.h" +#include "sysemu/kvm.h" #include "hw/boards.h" /* for machine_dump_guest_core() */ @@ -92,10 +99,21 @@ /* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */ #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 -/* We can't use any flag that is bigger than 0x200 */ +#define RAM_SAVE_FLAG_ENCRYPTED_DATA 0x400 + +bool memcrypt_enabled(void) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + if(ms->cgs) + return ms->cgs->ready; + else + return false; +} XBZRLECacheStats xbzrle_counters; +extern MigrationDecompressOps *decompress_ops; + /* used by the search for pages to send */ struct PageSearchStatus { /* The migration channel used for a specific host page */ @@ -257,6 +275,10 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, nr); } +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset) +{ + set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); +} #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) /* @@ -1123,6 +1145,10 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, QEMUFile *file = pss->pss_channel; int len = 0; + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { + return 0; + } + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { return 0; } @@ -1204,6 +1230,125 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, return 1; } +/** + * ram_save_encrypted_page - send the given encrypted page to the stream + */ +static int ram_save_encrypted_page(RAMState *rs, PageSearchStatus *pss) +{ + QEMUFile *file = pss->pss_channel; + int ret; + uint8_t *p; + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + uint64_t bytes_xmit = 0; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + 
cgs_class->memory_encryption_ops; + + p = block->host + offset; + trace_ram_save_page(block->idstr, (uint64_t)offset, p); + + ram_transferred_add(save_page_header(pss, file, block, + offset | RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(file, RAM_SAVE_ENCRYPTED_PAGE); + ret = ops->save_outgoing_page(file, p, TARGET_PAGE_SIZE, &bytes_xmit); + if (ret) { + return -1; + } + ram_transferred_add(4 + bytes_xmit); + stat64_add(&mig_stats.normal_pages, 1); + + return 1; +} + +/** + * ram_save_shared_region_list: send the shared region list + */ +static int ram_save_shared_region_list(RAMState *rs, QEMUFile *f) +{ + int ret; + uint64_t bytes_xmit = 0; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + ram_transferred_add(save_page_header(pss, f, + pss->last_sent_block, + RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(f, RAM_SAVE_SHARED_REGIONS_LIST); + ret = ops->save_outgoing_shared_regions_list(f, &bytes_xmit); + if (ret < 0) { + return ret; + } + ram_transferred_add(4 + bytes_xmit); + + return 0; +} + +/** + * ram_save_encrypted_cpu_state: send the encrypted cpu state + */ +static int ram_save_encrypted_cpu_state(RAMState *rs, QEMUFile *f) +{ + int ret; + uint64_t bytes_xmit = 0; + PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + ram_transferred_add(save_page_header(pss, f, + pss->last_sent_block, + RAM_SAVE_FLAG_ENCRYPTED_DATA)); + qemu_put_be32(f, RAM_SAVE_ENCRYPTED_CPU_STATE); + ret = ops->save_outgoing_cpu_state(f, &bytes_xmit); + if (ret < 0) { + return ret; + } + ram_transferred_add(4 + bytes_xmit); + + return 0; +} + +static int load_encrypted_data(QEMUFile *f, uint8_t *ptr) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + + int flag; + + flag = qemu_get_be32(f); + + if (flag == RAM_SAVE_ENCRYPTED_PAGE) { + return ops->load_incoming_page(f, ptr); + } else if (flag == RAM_SAVE_SHARED_REGIONS_LIST) { + return ops->load_incoming_shared_regions_list(f); + } else if (flag == RAM_SAVE_ENCRYPTED_PAGE_BATCH) { + return ops->queue_incoming_page(f, ptr); + } else if (flag == RAM_SAVE_ENCRYPTED_PAGE_BATCH_END) { + if (ops->queue_incoming_page(f, ptr)) { + error_report("Failed to queue incoming data"); + return -EINVAL; + } + return ops->load_queued_incoming_pages(f); + } else if (flag == RAM_SAVE_ENCRYPTED_CPU_STATE) { + return ops->load_incoming_cpu_state(f); + } else { + error_report("unknown encrypted flag %x", flag); + return 1; + } +} + /** * ram_save_page: send the given page to the stream * @@ -1250,13 +1395,11 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) return pages; } -static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, - ram_addr_t offset) +static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(file, block, offset) < 0) { + if (!multifd_queue_page(block, offset)) { return -1; } - 
stat64_add(&mig_stats.normal_pages, 1); return 1; } @@ -1336,7 +1479,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; - int ret = multifd_send_sync_main(f); + int ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -2034,6 +2177,56 @@ static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, compress_send_queued_data); } +/** + * encrypted_test_list: check if the page is encrypted + * + * Returns a bool indicating whether the page is encrypted. + */ +static bool encrypted_test_list(RAMState *rs, RAMBlock *block, + unsigned long page) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + ConfidentialGuestSupportClass *cgs_class = + (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs)); + struct ConfidentialGuestMemoryEncryptionOps *ops = + cgs_class->memory_encryption_ops; + unsigned long gfn; + hwaddr paddr = 0; + int ret; + + /* ROM devices contains the unencrypted data */ + if (memory_region_is_rom(block->mr)) { + return false; + } + + if (!strcmp(memory_region_name(block->mr), "system.flash0")) { + return true; + } + + if (!strcmp(memory_region_name(block->mr), "system.flash1")) { + return false; + } + + if (!strcmp(memory_region_name(block->mr), "vga.vram")) { + return false; + } + + /* + * Translate page in ram_addr_t address space to GPA address + * space using memory region. + */ + if (kvm_enabled()) { + ret = kvm_physical_memory_addr_from_host(kvm_state, + block->host + (page << TARGET_PAGE_BITS), &paddr); + if (ret == 0) { + return false; + } + } + gfn = paddr >> TARGET_PAGE_BITS; + + return ops->is_gfn_in_unshared_region(gfn); +} + /** * ram_save_target_page_legacy: save one target page * @@ -2044,7 +2237,6 @@ static bool save_compress_page(RAMState *rs, PageSearchStatus *pss, */ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) { - RAMBlock *block = pss->block; ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; @@ -2052,6 +2244,17 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return res; } + /* + * If memory encryption is enabled then use memory encryption APIs + * to write the outgoing buffer to the wire. The encryption APIs + * will take care of accessing the guest memory and re-encrypt it + * for the transport purposes. + */ + if (memcrypt_enabled() && + encrypted_test_list(rs, pss->block, pss->page)) { + return ram_save_encrypted_page(rs, pss); + } + if (save_compress_page(rs, pss, offset)) { return 1; } @@ -2060,17 +2263,33 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return 1; } + return ram_save_page(rs, pss); +} + +/** + * ram_save_target_page_multifd: send one target page to multifd workers + * + * Returns 1 if the page was queued, -1 otherwise. + * + * @rs: current RAM state + * @pss: data about the page we want to send + */ +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) +{ + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + /* - * Do not use multifd in postcopy as one whole host page should be - * placed. Meanwhile postcopy requires atomic update of pages, so even - * if host page size == guest page size the dest guest during run may - * still see partially copied pages which is data corruption. 
+/**
+ * ram_save_target_page_multifd: send one target page to multifd workers
+ *
+ * Returns 1 if the page was queued, -1 otherwise.
+ *
+ * @rs: current RAM state
+ * @pss: data about the page we want to send
+ */
+static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss)
+{
+    RAMBlock *block = pss->block;
+    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
+
     /*
-     * Do not use multifd in postcopy as one whole host page should be
-     * placed. Meanwhile postcopy requires atomic update of pages, so even
-     * if host page size == guest page size the dest guest during run may
-     * still see partially copied pages which is data corruption.
+     * While using multifd live migration, we still need to handle zero
+     * page checking on the migration main thread.
      */
-    if (migrate_multifd() && !migration_in_postcopy()) {
-        return ram_save_multifd_page(pss->pss_channel, block, offset);
+    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
+        if (save_zero_page(rs, pss, offset)) {
+            return 1;
+        }
     }

-    return ram_save_page(rs, pss);
+    return ram_save_multifd_page(block, offset);
 }

 /* Should be called before sending a host page */
@@ -2177,6 +2396,196 @@ out:
     return ret;
 }

+#ifdef CONFIG_HYGON_CSV_MIG_ACCEL
+/**
+ * ram_save_encrypted_pages_in_batch: send the given encrypted pages to
+ * the stream.
+ *
+ * Pages of 4K size are sent in batches; saving stops at the end of
+ * the block.
+ *
+ * The caller must hold ram_state.bitmap_mutex when calling this
+ * function.
+ *
+ * Returns the number of pages written, or negative on error
+ *
+ * @rs: current RAM state
+ * @pss: data about the page we want to send
+ */
+static int
+ram_save_encrypted_pages_in_batch(RAMState *rs, PageSearchStatus *pss)
+{
+    bool page_dirty;
+    int ret;
+    int tmppages, pages = 0;
+    uint8_t *p;
+    uint32_t host_len = 0;
+    uint64_t bytes_xmit = 0;
+    ram_addr_t offset, start_offset = 0;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    ConfidentialGuestSupportClass *cgs_class =
+        (ConfidentialGuestSupportClass *)object_get_class(OBJECT(ms->cgs));
+    struct ConfidentialGuestMemoryEncryptionOps *ops =
+        cgs_class->memory_encryption_ops;
+
+    do {
+        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
+
+        /* Check whether the page is dirty and, if so, send it */
+        if (page_dirty) {
+            /* Process the unencrypted page */
+            if (!encrypted_test_list(rs, pss->block, pss->page)) {
+                tmppages = migration_ops->ram_save_target_page(rs, pss);
+            } else {
+                /* Calculate the offset and host virtual address of the page */
+                offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
+                p = pss->block->host + offset;
+
+                /* Record the offset and host virtual address of the first
+                 * page in this loop; they are used below.
+                 */
+                if (host_len == 0) {
+                    start_offset = offset | RAM_SAVE_FLAG_ENCRYPTED_DATA;
+                } else {
+                    offset |= (RAM_SAVE_FLAG_ENCRYPTED_DATA | RAM_SAVE_FLAG_CONTINUE);
+                }
+
+                /* Queue the outgoing page if it is not a zero page.
+                 * Once the queued pages fill the outgoing page window,
+                 * process them below.
+                 */
+                if (ops->queue_outgoing_page(p, TARGET_PAGE_SIZE, offset))
+                    return -1;
+
+                tmppages = 1;
+                host_len += TARGET_PAGE_SIZE;
+
+                stat64_add(&mig_stats.normal_pages, 1);
+            }
+        } else {
+            tmppages = 0;
+        }
+
+        if (tmppages >= 0) {
+            pages += tmppages;
+        } else {
+            return tmppages;
+        }
+
+        pss_find_next_dirty(pss);
+    } while (offset_in_ramblock(pss->block,
+                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS) &&
+             host_len < CSV_OUTGOING_PAGE_WINDOW_SIZE);
+
+    /* Check if there are any queued pages */
+    if (host_len != 0) {
+        ram_transferred_add(save_page_header(pss, pss->pss_channel,
+                                             pss->block, start_offset));
+        /* if more than one page is queued, the flag is BATCH, else BATCH_END */
+        if (host_len > TARGET_PAGE_SIZE)
+            qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE_BATCH);
+        else
+            qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE_BATCH_END);
+        ram_transferred_add(4);
+        /* Process the queued pages in batch */
+        ret = ops->save_queued_outgoing_pages(pss->pss_channel, &bytes_xmit);
+        if (ret) {
+            return -1;
+        }
+        ram_transferred_add(bytes_xmit);
+    }
+
+    /* The offset we leave with is the last one we looked at */
+    pss->page--;
+
+    return pages;
+}
+#endif
+
+/**
+ * ram_save_csv3_pages - send the given csv3 VM pages to the stream
+ */
+static int ram_save_csv3_pages(RAMState *rs, PageSearchStatus *pss)
+{
+    bool page_dirty;
+    int ret;
+    int tmppages, pages = 0;
+    uint8_t *p;
+    uint32_t host_len = 0;
+    uint64_t bytes_xmit = 0;
+    RAMBlock *block = pss->block;
+    ram_addr_t offset = 0;
+    hwaddr paddr = RAM_ADDR_INVALID;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    ConfidentialGuestSupportClass *cgs_class =
+        (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs));
+    struct ConfidentialGuestMemoryEncryptionOps *ops =
+        cgs_class->memory_encryption_ops;
+
+    if (!kvm_csv3_enabled())
+        return 0;
+
+    do {
+        page_dirty = migration_bitmap_clear_dirty(rs, block, pss->page);
+
+        /* Check whether the page is dirty and, if so, send it */
+        if (page_dirty) {
+            ret = kvm_physical_memory_addr_from_host(kvm_state,
+                      block->host + (pss->page << TARGET_PAGE_BITS), &paddr);
+            /* Process ROM or MMIO */
+            if (paddr == RAM_ADDR_INVALID ||
+                memory_region_is_rom(block->mr)) {
+                tmppages = migration_ops->ram_save_target_page(rs, pss);
+            } else {
+                /* Calculate the offset and host virtual address of the page */
+                offset = pss->page << TARGET_PAGE_BITS;
+                p = block->host + offset;
+
+                if (ops->queue_outgoing_page(p, TARGET_PAGE_SIZE, offset))
+                    return -1;
+
+                tmppages = 1;
+                host_len += TARGET_PAGE_SIZE;
+
+                stat64_add(&mig_stats.normal_pages, 1);
+            }
+        } else {
+            tmppages = 0;
+        }
+
+        if (tmppages >= 0) {
+            pages += tmppages;
+        } else {
+            return tmppages;
+        }
+
+        pss_find_next_dirty(pss);
+    } while (offset_in_ramblock(block,
+                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS) &&
+             host_len < CSV3_OUTGOING_PAGE_WINDOW_SIZE);
+
+    /* Check if there are any queued pages */
+    if (host_len != 0) {
+        /* Always set offset to 0 for csv3. */
+        ram_transferred_add(save_page_header(pss, pss->pss_channel,
+                                             block, 0 | RAM_SAVE_FLAG_ENCRYPTED_DATA));
+
+        qemu_put_be32(pss->pss_channel, RAM_SAVE_ENCRYPTED_PAGE);
+        ram_transferred_add(4);
+        /* Process the queued pages in batch */
+        ret = ops->save_queued_outgoing_pages(pss->pss_channel, &bytes_xmit);
+        if (ret) {
+            return -1;
+        }
+        ram_transferred_add(bytes_xmit);
+    }
+
+    /* The offset we leave with is the last one we looked at */
+    pss->page--;
+
+    return pages;
+}
+
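
For orientation, the destination consumes such a window as the mirror image of the loop above: each RAM_SAVE_ENCRYPTED_PAGE_BATCH record queues one guest page, and the closing RAM_SAVE_ENCRYPTED_PAGE_BATCH_END record queues the last page and then decrypts the whole window in a single backend call. A sketch using the callbacks this patch introduces; the helper itself is illustrative:

    /* Sketch only: destination-side handling of one record of a batch. */
    static int load_batched_record(QEMUFile *f, uint8_t *host, uint32_t subtype,
                                   struct ConfidentialGuestMemoryEncryptionOps *ops)
    {
        if (ops->queue_incoming_page(f, host)) {
            error_report("Failed to queue incoming page");
            return -EINVAL;
        }
        if (subtype == RAM_SAVE_ENCRYPTED_PAGE_BATCH) {
            return 0;     /* more pages follow in this window */
        }
        /* RAM_SAVE_ENCRYPTED_PAGE_BATCH_END: decrypt the queued window */
        return ops->load_queued_incoming_pages(f);
    }
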
 /**
  * ram_save_host_page: save a whole host page
  *
@@ -2212,6 +2621,21 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
         return 0;
     }

+    if (kvm_csv3_enabled())
+        return ram_save_csv3_pages(rs, pss);
+
+#ifdef CONFIG_HYGON_CSV_MIG_ACCEL
+    /*
+     * If the command batch function is enabled and memory encryption is
+     * enabled, use the command batch APIs to accelerate writing the
+     * outgoing buffer to the wire. The encryption APIs re-encrypt the
+     * data with a transport key so that the data is protected on the
+     * wire.
+     */
+    if (memcrypt_enabled() && is_hygon_cpu() && !migration_in_postcopy())
+        return ram_save_encrypted_pages_in_batch(rs, pss);
+#endif
+
     /* Update host page boundary information */
     pss_host_page_prepare(pss);

@@ -2404,6 +2828,11 @@ static void ram_save_cleanup(void *opaque)
          * memory_global_dirty_log_stop will assert that
          * memory_global_dirty_log_start/stop used in pairs
          */
+#ifdef TARGET_AARCH64
+        if (kvm_enabled()) {
+            kvm_update_hdbss_cap(false, 0);
+        }
+#endif
         memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
     }
 }
@@ -2807,6 +3236,11 @@ static void ram_init_bitmaps(RAMState *rs)
     ram_list_init_bitmaps();
     /* We don't use dirty log with background snapshots */
     if (!migrate_background_snapshot()) {
+#ifdef TARGET_AARCH64
+        if (kvm_enabled()) {
+            kvm_update_hdbss_cap(true, migrate_hdbss_buffer_size());
+        }
+#endif
         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
         migration_bitmap_sync_precopy(rs, false);
     }
@@ -2917,6 +3351,18 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
     }
 }

+static int ram_encrypted_save_setup(void)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    ConfidentialGuestSupportClass *cgs_class =
+        (ConfidentialGuestSupportClass *) object_get_class(OBJECT(ms->cgs));
+    struct ConfidentialGuestMemoryEncryptionOps *ops =
+        cgs_class->memory_encryption_ops;
+    MigrationParameters *p = &migrate_get_current()->parameters;
+
+    return ops->save_setup(p->sev_pdh, p->sev_plat_cert, p->sev_amd_cert);
+}
+
 /*
  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
  * long-running RCU critical section.
When rcu-reclaims in the code @@ -2952,6 +3398,13 @@ static int ram_save_setup(QEMUFile *f, void *opaque) (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; WITH_RCU_READ_LOCK_GUARD() { + + if (memcrypt_enabled()) { + if (ram_encrypted_save_setup()) { + return -1; + } + } + qemu_put_be64(f, ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE); @@ -2982,10 +3435,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } migration_ops = g_malloc0(sizeof(MigrationOps)); - migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + if (migrate_multifd()) { + migration_ops->ram_save_target_page = ram_save_target_page_multifd; + } else { + migration_ops->ram_save_target_page = ram_save_target_page_legacy; + } qemu_mutex_unlock_iothread(); - ret = multifd_send_sync_main(f); + ret = multifd_send_sync_main(); qemu_mutex_lock_iothread(); if (ret < 0) { return ret; @@ -3109,7 +3567,7 @@ out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -3181,9 +3639,31 @@ static int ram_save_complete(QEMUFile *f, void *opaque) qemu_file_set_error(f, ret); return ret; } + + /* send the shared regions list */ + if (memcrypt_enabled()) { + ret = ram_save_shared_region_list(rs, f); + if (ret < 0) { + qemu_file_set_error(f, ret); + return ret; + } + + /* + * send the encrypted cpu state, for example, CSV2 guest's + * vmsa for each vcpu. + */ + if (is_hygon_cpu()) { + ret = ram_save_encrypted_cpu_state(rs, f); + if (ret < 0) { + error_report("Failed to save encrypted cpu state"); + qemu_file_set_error(f, ret); + return ret; + } + } + } } - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -3918,7 +4398,8 @@ static int ram_load_precopy(QEMUFile *f) } if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | - RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { + RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE | + RAM_SAVE_FLAG_ENCRYPTED_DATA)) { RAMBlock *block = ram_block_from_stream(mis, f, flags, RAM_CHANNEL_PRECOPY); @@ -3979,7 +4460,7 @@ static int ram_load_precopy(QEMUFile *f) case RAM_SAVE_FLAG_COMPRESS_PAGE: len = qemu_get_be32(f); - if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { + if (decompress_ops->check_len(len)) { error_report("Invalid compressed data length: %d", len); ret = -EINVAL; break; @@ -4011,6 +4492,12 @@ static int ram_load_precopy(QEMUFile *f) qemu_file_set_error(f, ret); } break; + case RAM_SAVE_FLAG_ENCRYPTED_DATA: + if (load_encrypted_data(f, host)) { + error_report("Failed to load encrypted data"); + ret = -EINVAL; + } + break; default: error_report("Unknown combination of migration flags: 0x%x", flags); ret = -EINVAL; diff --git a/migration/ram.h b/migration/ram.h index 9b937a446b7382ddbea7194a88d44e9fa8b54462..cd263df02662d0d69180c22e0c6950f27f86abea 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -69,6 +69,7 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr); +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset); int64_t ramblock_recv_bitmap_send(QEMUFile *file, const char 
*block_name); bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); diff --git a/migration/savevm.c b/migration/savevm.c index eec5503a422081a5e2fce7238820ec6fc8db1b8b..bde05eebe4754feb537e7b23de0bee0e48f89cd7 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -61,6 +61,7 @@ #include "sysemu/replay.h" #include "sysemu/runstate.h" #include "sysemu/sysemu.h" +#include "sysemu/kvm.h" #include "sysemu/xen.h" #include "migration/colo.h" #include "qemu/bitmap.h" @@ -1553,6 +1554,7 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, ret = vmstate_save(f, se, vmdesc); if (ret) { qemu_file_set_error(f, ret); + json_writer_free(vmdesc); return ret; } @@ -1572,6 +1574,7 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, migrate_set_error(ms, local_err); error_report_err(local_err); qemu_file_set_error(f, ret); + json_writer_free(vmdesc); return ret; } } @@ -2854,6 +2857,10 @@ int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) uint8_t section_type; int ret = 0; + if (qemu_mutex_iothread_locked()) { + memory_region_transaction_begin(); + } + retry: while (true) { section_type = qemu_get_byte(f); @@ -2897,6 +2904,9 @@ retry: } out: + if (qemu_mutex_iothread_locked()) { + memory_region_transaction_commit(); + } if (ret < 0) { qemu_file_set_error(f, ret); @@ -2956,7 +2966,10 @@ int qemu_loadvm_state(QEMUFile *f) trace_qemu_loadvm_state_post_main(ret); if (mis->have_listen_thread) { - /* Listen thread still going, can't clean up yet */ + /* + * Postcopy listen thread still going, don't synchronize the + * cpus yet. + */ return ret; } @@ -2999,7 +3012,6 @@ int qemu_loadvm_state(QEMUFile *f) } } - qemu_loadvm_state_cleanup(); cpu_synchronize_all_post_init(); return ret; @@ -3042,6 +3054,11 @@ int qemu_loadvm_approve_switchover(void) bool save_snapshot(const char *name, bool overwrite, const char *vmstate, bool has_devices, strList *devices, Error **errp) { + if (virtcca_cvm_enabled()) { + error_setg(errp, "The savevm command is temporarily unsupported in cvm."); + return false; + } + BlockDriverState *bs; QEMUSnapshotInfo sn1, *sn = &sn1; int ret = -1, ret2; diff --git a/migration/socket.c b/migration/socket.c index 98e3ea1514719d7059c30a192ff265e1dc1ba28c..9ab89b1e089b6053078de50db067fab08ef610e3 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -60,17 +60,6 @@ QIOChannel *socket_send_channel_create_sync(Error **errp) return QIO_CHANNEL(sioc); } -int socket_send_channel_destroy(QIOChannel *send) -{ - /* Remove channel */ - object_unref(OBJECT(send)); - if (outgoing_args.saddr) { - qapi_free_SocketAddress(outgoing_args.saddr); - outgoing_args.saddr = NULL; - } - return 0; -} - struct SocketConnectData { MigrationState *s; char *hostname; @@ -137,6 +126,14 @@ void socket_start_outgoing_migration(MigrationState *s, NULL); } +void socket_cleanup_outgoing_migration(void) +{ + if (outgoing_args.saddr) { + qapi_free_SocketAddress(outgoing_args.saddr); + outgoing_args.saddr = NULL; + } +} + static void socket_accept_incoming_migration(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) diff --git a/migration/socket.h b/migration/socket.h index 5e4c33b8ea541120912310944e6689f92d47a881..46c233ecd29e5e4977c978abbaf7ec553a9c8576 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -23,10 +23,11 @@ void socket_send_channel_create(QIOTaskFunc f, void *data); QIOChannel *socket_send_channel_create_sync(Error **errp); -int socket_send_channel_destroy(QIOChannel *send); void 
socket_start_incoming_migration(SocketAddress *saddr, Error **errp); void socket_start_outgoing_migration(MigrationState *s, SocketAddress *saddr, Error **errp); +void socket_cleanup_outgoing_migration(void); + #endif diff --git a/migration/trace-events b/migration/trace-events index de4a743c8a77ccc7b28160c50e714ebae66566bf..f0e1cb80c75bbcb71b443df782058cddb5b895c0 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -128,21 +128,21 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" -multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" +multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" -multifd_recv_sync_main_wait(uint8_t id) "channel %u" +multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" -multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 +multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" +multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" -multifd_send_terminate_threads(bool error) "error %d" -multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 +multifd_send_terminate_threads(void) "" +multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" diff --git a/migration/vmstate.c b/migration/vmstate.c index b7723a4187144d4e978713a8994ee200c1249afb..e621d8ddb7159d484e54dc00f1d3616090c42f6e 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -20,9 +20,11 @@ #include "qemu/bitops.h" #include "qemu/error-report.h" #include "trace.h" +#include "exec/memory.h" static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, JSONWriter *vmdesc); + void *opaque, JSONWriter *vmdesc, + Error **errp); static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, void 
*opaque);
@@ -183,6 +185,13 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
             return ret;
         }
         if (vmsd->post_load) {
+            /*
+             * We call memory_region_transaction_begin() in
+             * qemu_loadvm_state_main(), so the address space is not
+             * updated while vm state is being loaded. But some devices
+             * need an up-to-date address space here, so force a commit
+             * of the memory region transaction before calling post_load.
+             */
+            memory_region_commit();
             ret = vmsd->post_load(opaque, version_id);
         }
         trace_vmstate_load_state_end(vmsd->name, "end", ret);
@@ -440,12 +449,13 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
         json_writer_end_array(vmdesc);
     }

-    ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc);
+    ret = vmstate_subsection_save(f, vmsd, opaque, vmdesc, errp);

     if (vmsd->post_save) {
         int ps_ret = vmsd->post_save(opaque);
-        if (!ret) {
+        if (!ret && ps_ret) {
             ret = ps_ret;
+            error_setg(errp, "post-save failed: %s", vmsd->name);
         }
     }
     return ret;
@@ -515,7 +525,8 @@ static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd,
 }

 static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
-                                   void *opaque, JSONWriter *vmdesc)
+                                   void *opaque, JSONWriter *vmdesc,
+                                   Error **errp)
 {
     const VMStateDescription **sub = vmsd->subsections;
     bool vmdesc_has_subsections = false;
@@ -543,7 +554,7 @@ static int vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd,
             qemu_put_byte(f, len);
             qemu_put_buffer(f, (uint8_t *)vmsdsub->name, len);
             qemu_put_be32(f, vmsdsub->version_id);
-            ret = vmstate_save_state(f, vmsdsub, opaque, vmdesc);
+            ret = vmstate_save_state_with_err(f, vmsdsub, opaque, vmdesc, errp);
             if (ret) {
                 return ret;
             }
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 871898ac46b77dab291bd59d155703337376b3e6..5bb3c9cd461a17a5bae1f1009c919230beda4310 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -24,6 +24,7 @@
 #include "qapi/qapi-commands-control.h"
 #include "qapi/qapi-commands-misc.h"
 #include "qapi/qmp/qdict.h"
+#include "qapi/qapi-visit-migration.h"
 #include "qemu/cutils.h"
 #include "hw/intc/intc.h"
 #include "qemu/log.h"
diff --git a/monitor/monitor-internal.h b/monitor/monitor-internal.h
index 252de856812f41eb45a88866240950861b943822..d7842fa464fc244a6e8586dd6f42f90e1b953b60 100644
--- a/monitor/monitor-internal.h
+++ b/monitor/monitor-internal.h
@@ -144,6 +144,7 @@ typedef struct {
     const QmpCommandList *commands;
     bool capab_offered[QMP_CAPABILITY__MAX]; /* capabilities offered */
     bool capab[QMP_CAPABILITY__MAX];         /* offered and accepted */
+    uint64_t qmp_client_id; /* QMP client id, updated when the peer disconnects */
     /*
      * Protects qmp request/response queue.
      * Take monitor_lock first when you need both.
diff --git a/monitor/monitor.c b/monitor/monitor.c
index 01ede1babd3d5ce004f528d870d54b9a0a1279c1..8d59a7661204787922232b6eded826cd10f2f5fe 100644
--- a/monitor/monitor.c
+++ b/monitor/monitor.c
@@ -29,10 +29,13 @@
 #include "qapi/qapi-emit-events.h"
 #include "qapi/qapi-visit-control.h"
 #include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qjson.h"
 #include "qemu/error-report.h"
 #include "qemu/option.h"
 #include "sysemu/qtest.h"
 #include "trace.h"
+#include "qemu/log.h"
+#include "qapi/qmp/qobject.h"

 /*
  * To prevent flooding clients, events can be throttled.
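
The effect of the forced commit above is easiest to see from a device's perspective: a post_load hook that reads guest memory would otherwise resolve addresses against the stale FlatView, because the transaction opened in qemu_loadvm_state_main() has not been flushed yet. Everything named below is hypothetical and only illustrates the ordering constraint:

    /* Hypothetical device hook, for illustration only. */
    static int mydev_post_load(void *opaque, int version_id)
    {
        MyDevState *s = opaque;
        uint8_t desc[16];

        /*
         * Without the memory_region_commit() above, this read could be
         * resolved against the address space as it looked before the
         * memory sections of this migration stream were applied.
         */
        address_space_read(&address_space_memory, s->desc_gpa,
                           MEMTXATTRS_UNSPECIFIED, desc, sizeof(desc));
        return 0;
    }
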
The @@ -338,6 +341,7 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) { Monitor *mon; MonitorQMP *qmp_mon; + GString *json; trace_monitor_protocol_event_emit(event, qdict); QTAILQ_FOREACH(mon, &mon_list, entry) { @@ -348,6 +352,13 @@ static void monitor_qapi_event_emit(QAPIEvent event, QDict *qdict) qmp_mon = container_of(mon, MonitorQMP, common); if (qmp_mon->commands != &qmp_cap_negotiation_commands) { qmp_send_response(qmp_mon, qdict); + json = qobject_to_json(QOBJECT(qdict)); + if (json) { + if (!strstr(json->str, "RTC_CHANGE")) { + qemu_log("%s\n", json->str); + } + g_string_free(json, true); + } } } } @@ -778,6 +789,33 @@ int monitor_init_opts(QemuOpts *opts, Error **errp) return ret; } +void monitor_qapi_event_discard_io_error(void) +{ + GHashTableIter event_iter; + MonitorQAPIEventState *evstate; + gpointer key, value; + GString *json; + + qemu_mutex_lock(&monitor_lock); + g_hash_table_iter_init(&event_iter, monitor_qapi_event_state); + while (g_hash_table_iter_next(&event_iter, &key, &value)) { + evstate = key; + /* Only QAPI_EVENT_BLOCK_IO_ERROR is discarded */ + if (evstate->event == QAPI_EVENT_BLOCK_IO_ERROR) { + g_hash_table_iter_remove(&event_iter); + json = qobject_to_json(QOBJECT(evstate->qdict)); + qemu_log(" %s event discarded\n", json->str); + timer_del(evstate->timer); + timer_free(evstate->timer); + qobject_unref(evstate->data); + qobject_unref(evstate->qdict); + g_string_free(json, true); + g_free(evstate); + } + } + qemu_mutex_unlock(&monitor_lock); +} + QemuOptsList qemu_mon_opts = { .name = "mon", .implied_opt_name = "chardev", diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c index b0f948d33766bb781b5bb1cfdfacb3d190828b9a..c0b66f11bf88ab61a8a195ead686acb94a9e1548 100644 --- a/monitor/qmp-cmds.c +++ b/monitor/qmp-cmds.c @@ -23,6 +23,7 @@ #include "sysemu/runstate.h" #include "sysemu/runstate-action.h" #include "sysemu/block-backend.h" +#include "sysemu/kvm.h" #include "qapi/error.h" #include "qapi/qapi-init-commands.h" #include "qapi/qapi-commands-control.h" @@ -32,6 +33,7 @@ #include "hw/mem/memory-device.h" #include "hw/intc/intc.h" #include "hw/rdma/rdma.h" +#include "qemu/log.h" NameInfo *qmp_query_name(Error **errp) { @@ -49,6 +51,11 @@ void qmp_quit(Error **errp) void qmp_stop(Error **errp) { + if (virtcca_cvm_enabled()) { + error_setg(errp, "The stop command is temporarily unsupported in cvm."); + return; + } + /* if there is a dump in background, we should wait until the dump * finished */ if (qemu_system_dump_in_progress()) { @@ -110,8 +117,10 @@ void qmp_cont(Error **errp) } if (runstate_check(RUN_STATE_INMIGRATE)) { + qemu_log("qmp cont is received in migration\n"); autostart = 1; } else { + qemu_log("qmp cont is received and vm is started\n"); vm_start(); } } diff --git a/monitor/qmp.c b/monitor/qmp.c index 6eee450fe40c120e9b39105a780247ae495f84c7..8f7671c5f12513a3c81ecc60ff86ada45f08a066 100644 --- a/monitor/qmp.c +++ b/monitor/qmp.c @@ -149,18 +149,19 @@ void qmp_send_response(MonitorQMP *mon, const QDict *rsp) * Null @rsp can only happen for commands with QCO_NO_SUCCESS_RESP. * Nothing is emitted then. */ -static void monitor_qmp_respond(MonitorQMP *mon, QDict *rsp) +static void monitor_qmp_respond(MonitorQMP *mon, QDict *rsp, uint64_t req_client_id) { - if (rsp) { - qmp_send_response(mon, rsp); + if (!rsp || (mon->qmp_client_id != req_client_id)) { + return; } + qmp_send_response(mon, rsp); } /* * Runs outside of coroutine context for OOB commands, but in * coroutine context for everything else. 
*/ -static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req) +static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req, uint64_t req_client_id) { QDict *rsp; QDict *error; @@ -180,7 +181,7 @@ static void monitor_qmp_dispatch(MonitorQMP *mon, QObject *req) } } - monitor_qmp_respond(mon, rsp); + monitor_qmp_respond(mon, rsp, req_client_id); qobject_unref(rsp); } @@ -340,13 +341,13 @@ void coroutine_fn monitor_qmp_dispatcher_co(void *data) trace_monitor_qmp_cmd_in_band(id_json->str); g_string_free(id_json, true); } - monitor_qmp_dispatch(mon, req_obj->req); + monitor_qmp_dispatch(mon, req_obj->req, mon->qmp_client_id); } else { assert(req_obj->err); trace_monitor_qmp_err_in_band(error_get_pretty(req_obj->err)); rsp = qmp_error_response(req_obj->err); req_obj->err = NULL; - monitor_qmp_respond(mon, rsp); + monitor_qmp_respond(mon, rsp, mon->qmp_client_id); qobject_unref(rsp); } @@ -402,7 +403,7 @@ static void handle_qmp_command(void *opaque, QObject *req, Error *err) trace_monitor_qmp_cmd_out_of_band(id_json->str); g_string_free(id_json, true); } - monitor_qmp_dispatch(mon, req); + monitor_qmp_dispatch(mon, req, mon->qmp_client_id); qobject_unref(req); return; } @@ -486,6 +487,7 @@ static void monitor_qmp_event(void *opaque, QEMUChrEvent event) mon_refcount++; break; case CHR_EVENT_CLOSED: + mon->qmp_client_id++; /* * Note: this is only useful when the output of the chardev * backend is still open. For example, when the backend is @@ -539,6 +541,7 @@ void monitor_init_qmp(Chardev *chr, bool pretty, Error **errp) } qemu_chr_fe_set_echo(&mon->common.chr, true); + mon->qmp_client_id = 1; /* Note: we run QMP monitor in I/O thread when @chr supports that */ monitor_data_init(&mon->common, true, false, qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_GCONTEXT)); diff --git a/nbd/client.c b/nbd/client.c index 29ffc609a4b7dd0b01e758ff02839f0e1a651421..987dde43c7d55fd2997dd7cdf3f42e1a2546eff1 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -24,6 +24,8 @@ #include "nbd-internal.h" #include "qemu/cutils.h" +#define NBD_TIMEOUT_SECONDS 30 + /* Definitions for opaque data types */ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); @@ -1310,6 +1312,12 @@ int nbd_init(int fd, QIOChannelSocket *sioc, NBDExportInfo *info, } } + if (ioctl(fd, NBD_SET_TIMEOUT, NBD_TIMEOUT_SECONDS) < 0) { + int serrno = errno; + error_setg(errp, "Failed setting timeout"); + return -serrno; + } + trace_nbd_init_finish(); return 0; diff --git a/nbd/server.c b/nbd/server.c index 895cf0a7525bdf85996564faa05d515042c7a45e..d1b3c35b59dd8a40d050dacd353fa7c22eec923a 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -124,10 +124,12 @@ struct NBDMetaContexts { struct NBDClient { int refcount; void (*close_fn)(NBDClient *client, bool negotiated); + void *owner; NBDExport *exp; QCryptoTLSCreds *tlscreds; char *tlsauthz; + uint32_t handshake_max_secs; QIOChannelSocket *sioc; /* The underlying data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ @@ -1865,7 +1867,7 @@ static void nbd_export_request_shutdown(BlockExport *blk_exp) blk_exp_ref(&exp->common); /* - * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a + * TODO: Should we expand QMP BlockExportRemoveMode enum to allow a * close mode that stops advertising the export to new clients but * still permits existing clients to run to completion? 
Because of
 * that possibility, nbd_export_close() can be called more than
@@ -2939,6 +2941,7 @@ static coroutine_fn void nbd_trip(void *opaque)
     NBDRequestData *req;
     NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
     int ret;
+    bool client_closing;
     Error *local_err = NULL;

     trace_nbd_trip();
@@ -3023,8 +3026,11 @@ disconnect:
     if (local_err) {
         error_reportf_err(local_err, "Disconnect client, due to: ");
     }
+    client_closing = client->closing;
     nbd_request_put(req);
-    client_close(client, true);
+    if (!client_closing) {
+        client_close(client, true);
+    }
     nbd_client_put(client);
 }

@@ -3038,33 +3044,63 @@ static void nbd_client_receive_next_request(NBDClient *client)
     }
 }

+static void nbd_handshake_timer_cb(void *opaque)
+{
+    QIOChannel *ioc = opaque;
+
+    trace_nbd_handshake_timer_cb();
+    qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+}
+
 static coroutine_fn void nbd_co_client_start(void *opaque)
 {
     NBDClient *client = opaque;
     Error *local_err = NULL;
+    QEMUTimer *handshake_timer = NULL;

     qemu_co_mutex_init(&client->send_lock);

+    /*
+     * Create a timer to bound the time spent in negotiation. If the
+     * timer expires, it is likely nbd_negotiate will fail because the
+     * socket was shut down.
+     */
+    if (client->handshake_max_secs > 0) {
+        handshake_timer = aio_timer_new(qemu_get_aio_context(),
+                                        QEMU_CLOCK_REALTIME,
+                                        SCALE_NS,
+                                        nbd_handshake_timer_cb,
+                                        client->sioc);
+        timer_mod(handshake_timer,
+                  qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
+                  client->handshake_max_secs * NANOSECONDS_PER_SECOND);
+    }
+
     if (nbd_negotiate(client, &local_err)) {
         if (local_err) {
             error_report_err(local_err);
         }
+        timer_free(handshake_timer);
         client_close(client, false);
         return;
     }

+    timer_free(handshake_timer);
     nbd_client_receive_next_request(client);
 }

 /*
- * Create a new client listener using the given channel @sioc.
+ * Create a new client listener using the given channel @sioc and @owner.
  * Begin servicing it in a coroutine. When the connection closes, call
- * @close_fn with an indication of whether the client completed negotiation.
+ * @close_fn with an indication of whether the client completed negotiation.
+ * Negotiation must finish within @handshake_max_secs seconds (0 for
+ * unbounded).
 */
 void nbd_client_new(QIOChannelSocket *sioc,
+                    uint32_t handshake_max_secs,
                     QCryptoTLSCreds *tlscreds,
                     const char *tlsauthz,
-                    void (*close_fn)(NBDClient *, bool))
+                    void (*close_fn)(NBDClient *, bool),
+                    void *owner)
 {
     NBDClient *client;
     Coroutine *co;
@@ -3076,13 +3112,21 @@ void nbd_client_new(QIOChannelSocket *sioc,
         object_ref(OBJECT(client->tlscreds));
     }
     client->tlsauthz = g_strdup(tlsauthz);
+    client->handshake_max_secs = handshake_max_secs;
     client->sioc = sioc;
     qio_channel_set_delay(QIO_CHANNEL(sioc), false);
     object_ref(OBJECT(client->sioc));
     client->ioc = QIO_CHANNEL(sioc);
     object_ref(OBJECT(client->ioc));
     client->close_fn = close_fn;
+    client->owner = owner;

     co = qemu_coroutine_create(nbd_co_client_start, client);
     qemu_coroutine_enter(co);
 }
+
+void *
+nbd_client_owner(NBDClient *client)
+{
+    return client->owner;
+}
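
Seen from a caller, the two new parameters travel together: @handshake_max_secs bounds negotiation, and @owner is an opaque handle that nbd_client_owner() returns later, typically from the close callback. A sketch of an embedding server — MyServer and both callbacks are hypothetical:

    /* Hypothetical embedding server, for illustration only. */
    static void my_client_closed(NBDClient *client, bool negotiated)
    {
        MyServer *srv = nbd_client_owner(client);   /* the @owner we passed */

        srv->nr_clients--;
    }

    static void my_accept(QIONetListener *listener, QIOChannelSocket *cioc,
                          gpointer opaque)
    {
        MyServer *srv = opaque;

        srv->nr_clients++;
        /* bound the handshake to 10s; 0 would disable the timer */
        nbd_client_new(cioc, 10, srv->tlscreds, NULL, my_client_closed, srv);
    }
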
diff --git a/nbd/trace-events b/nbd/trace-events
index 00ae3216a11148a39cfd8b7074bc01e6d5212fc9..cbd0a4ab7e4b9c7de605544ecc20204572e836f6 100644
--- a/nbd/trace-events
+++ b/nbd/trace-events
@@ -76,6 +76,7 @@ nbd_co_receive_request_payload_received(uint64_t cookie, uint64_t len) "Payload
 nbd_co_receive_ext_payload_compliance(uint64_t from, uint64_t len) "client sent non-compliant write without payload flag: from=0x%" PRIx64 ", len=0x%" PRIx64
 nbd_co_receive_align_compliance(const char *op, uint64_t from, uint64_t len, uint32_t align) "client sent non-compliant unaligned %s request: from=0x%" PRIx64 ", len=0x%" PRIx64 ", align=0x%" PRIx32
 nbd_trip(void) "Reading request"
+nbd_handshake_timer_cb(void) "client took too long to negotiate"

 # client-connection.c
 nbd_connect_thread_sleep(uint64_t timeout) "timeout %" PRIu64
diff --git a/net/colo-compare.c b/net/colo-compare.c
index 7f9e6f89ce058fdaa89b0084cf350298ed20b8ab..d4e51cb306e018123bcdf3250ad1e9a62820d192 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -413,8 +413,7 @@ static void colo_compare_tcp(CompareState *s, Connection *conn)
      * can ensure that the packet's payload is acknowledged by
      * primary and secondary.
      */
-    uint32_t min_ack = conn->pack - conn->sack > 0 ?
-                       conn->sack : conn->pack;
+    uint32_t min_ack = MIN(conn->pack, conn->sack);

 pri:
     if (g_queue_is_empty(&conn->primary_list)) {
diff --git a/net/dump.c b/net/dump.c
index 16073f245828f9c8083be6690e759cdcef110746..d880a7e299c550974e1d7066999dd6cde692b381 100644
--- a/net/dump.c
+++ b/net/dump.c
@@ -87,7 +87,7 @@ static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt,
     dumpiov[0].iov_len = sizeof(hdr);
     cnt = iov_copy(&dumpiov[1], cnt, iov, cnt, offset, caplen);

-    if (writev(s->fd, dumpiov, cnt + 1) != sizeof(hdr) + caplen) {
+    if (writev(s->fd, &dumpiov[0], cnt + 1) != sizeof(hdr) + caplen) {
         error_report("network dump write error - stopping dump");
         close(s->fd);
         s->fd = -1;
diff --git a/net/meson.build b/net/meson.build
index ce99bd4447f484e8842d7a5c47403217343dd0b5..7264479242c21737e86fbc55bc244c3da3e417a1 100644
--- a/net/meson.build
+++ b/net/meson.build
@@ -37,7 +37,7 @@ if have_netmap
   system_ss.add(files('netmap.c'))
 endif

-system_ss.add(when: libxdp, if_true: files('af-xdp.c'))
+system_ss.add(when: [libxdp, libbpf], if_true: files('af-xdp.c'))

 if have_vhost_net_user
   system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c'))
diff --git a/net/net.c b/net/net.c
index 0520bc1681a469545cfb56160c70d513384f964b..bcd3d7e04cce5b4a7c00345ffdd22b48ffd1f525 100644
--- a/net/net.c
+++ b/net/net.c
@@ -1322,6 +1322,12 @@ void qmp_netdev_del(const char *id, Error **errp)
         return;
     }

+    if (nc->info->type == NET_CLIENT_DRIVER_VHOST_USER && nc->peer) {
+        error_setg(errp, "Device '%s' is a netdev for vhost-user, "
+                   "please delete the peer front-end device (virtio-net) first.", id);
+        return;
+    }
+
     qemu_del_net_client(nc);

     /*
diff --git a/net/vhost-user.c b/net/vhost-user.c
index 12555518e83887be0a6fbf133bf354f7f57d8b05..86fd5056ab6ce3082678fc0b8e53379b440818a5 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -20,6 +20,9 @@
 #include "qemu/error-report.h"
 #include "qemu/option.h"
 #include "trace.h"
+#include "include/hw/virtio/vhost.h"
+
+#define VHOST_USER_RECONNECT_TIME (3)

 typedef struct NetVhostUserState {
     NetClientState nc;
@@ -292,6 +295,7 @@ static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
     trace_vhost_user_event(chr->label, event);
     switch (event) {
     case CHR_EVENT_OPENED:
+        qemu_chr_set_reconnect_time(chr, VHOST_USER_RECONNECT_TIME);
         if (vhost_user_start(queues, ncs, s->vhost_user) < 0) {
             qemu_chr_fe_disconnect(&s->chr);
             return;
@@ -370,6 +374,10 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
         qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
                                  net_vhost_user_event, NULL, nc0->name, NULL,
                                  true);
+        if (used_memslots_is_exceeded()) {
+            error_report("used memslots exceeded the backend limit, quit loop");
+            goto err;
+        }
     } while (!s->started);

     assert(s->vhost_net);
diff --git a/os-posix.c b/os-posix.c
index 52ef6990ff9d4c4e2cd52b6f6a820cdbb8efb9fb..8f70ee0534a920ab0761f7abfaaeb29c9b028393 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -306,6 +306,7 @@ int os_mlock(void)
 #ifdef HAVE_MLOCKALL
     int ret = 0;

+    qemu_log("do mlockall\n");
     ret = mlockall(MCL_CURRENT | MCL_FUTURE);
     if (ret < 0) {
         error_report("mlockall: %s", strerror(errno));
diff --git a/qapi/block-core.json b/qapi/block-core.json
index ca390c570022a82355d50d295120347a9e5ed2b4..0fa184698a1f2ef08448d9bd1c615233e45176fc 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1286,10 +1286,12 @@
 #
 # @auto: inherit the error handling policy of the backend (since: 2.7)
 #
+# @retry: retry I/O operations that encounter errors
+#
 # Since: 1.3
 ##
 { 'enum': 'BlockdevOnError',
-  'data': ['report', 'ignore', 'enospc', 'stop', 'auto'] }
+  'data': ['report', 'ignore', 'enospc', 'stop', 'auto', 'retry'] }
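
For context on how a device model might honour the new value: blk_get_error_action() maps the configured policy to a BlockErrorAction, and a retry result re-queues the request instead of reporting the error or stopping the VM. The request type, timer and done-helper below are hypothetical; only the two blk_*_action() calls are existing API:

    /* Hypothetical completion callback, for illustration only. */
    static void mydev_rw_complete(void *opaque, int ret)
    {
        MyRequest *req = opaque;

        if (ret < 0) {
            BlockErrorAction action =
                blk_get_error_action(req->blk, req->is_read, -ret);

            if (action == BLOCK_ERROR_ACTION_RETRY) {
                /* back off briefly, then resubmit the same request */
                timer_mod(req->retry_timer,
                          qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 100);
                return;
            }
            blk_error_action(req->blk, action, req->is_read, -ret);
        }
        mydev_request_done(req);
    }
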

 ##
 # @MirrorSyncMode:
@@ -1660,8 +1662,6 @@
 #
 # Takes a synchronous snapshot of a block device.
 #
-# For the arguments, see the documentation of BlockdevSnapshotSync.
-#
 # Returns:
 #     - nothing on success
 #     - If @device is not a valid block device, DeviceNotFound
@@ -1691,8 +1691,6 @@
 # device, the block device changes to using 'overlay' as its new
 # active image.
 #
-# For the arguments, see the documentation of BlockdevSnapshot.
-#
 # Features:
 #
 # @allow-write-only-overlay: If present, the check whether this
@@ -4906,6 +4904,8 @@
 #
 # @extent-size-hint: Extent size hint to add to the image file; 0 for
 #     not adding an extent size hint (default: 1 MB, since 5.1)
+# @cache: Cache mode used to write the output disk image
+# @buffersize: Buffer size used when creating the image
 #
 # Since: 2.12
 ##
@@ -4914,7 +4914,9 @@
             'size':                 'size',
             '*preallocation':       'PreallocMode',
             '*nocow':               'bool',
-            '*extent-size-hint':    'size'} }
+            '*extent-size-hint':    'size',
+            '*cache':               'str',
+            '*buffersize':          'size'} }

 ##
 # @BlockdevCreateOptionsGluster:
@@ -5476,10 +5478,12 @@
 #
 # @stop: error caused VM to be stopped
 #
+# @retry: retry I/O operations that encounter errors
+#
 # Since: 2.1
 ##
 { 'enum': 'BlockErrorAction',
-  'data': [ 'ignore', 'report', 'stop' ] }
+  'data': [ 'ignore', 'report', 'stop', 'retry' ] }

 ##
 # @BLOCK_IMAGE_CORRUPTED:
@@ -6029,9 +6033,6 @@
 # string, or a snapshot with name already exists, the operation will
 # fail.
 #
-# For the arguments, see the documentation of
-# BlockdevSnapshotInternal.
-#
 # Returns:
 #     - nothing on success
 #     - If @device is not a valid block device, GenericError
diff --git a/qapi/block-export.json b/qapi/block-export.json
index 7874a49ba71320a849052ffd8ea335722fff83fa..1d255d77e329fd0b15cd5f0642167227eb34c280 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -28,7 +28,7 @@
 # @max-connections: The maximum number of connections to allow at the
 #     same time, 0 for unlimited. Setting this to 1 also stops the
 #     server from advertising multiple client support (since 5.2;
-#     default: 0)
+#     default: 100)
 #
 # Since: 4.2
 ##
@@ -63,7 +63,7 @@
 # @max-connections: The maximum number of connections to allow at the
 #     same time, 0 for unlimited. Setting this to 1 also stops the
 #     server from advertising multiple client support (since 5.2;
-#     default: 0).
+#     default: 100).
 #
 # Returns: error if the server is already running.
 #
diff --git a/qapi/crypto.json b/qapi/crypto.json
index fd3d46ebd12821fbed62196e76b87ef25be6802e..af38f0a4bd9e021a382a2a247c887f4df2596cf9 100644
--- a/qapi/crypto.json
+++ b/qapi/crypto.json
@@ -58,11 +58,13 @@
 #
 # @ripemd160: RIPEMD-160. (since 2.7)
 #
+# @sm3: SM3.
(since 8.2.0)
+#
 # Since: 2.6
 ##
 { 'enum': 'QCryptoHashAlgorithm',
   'prefix': 'QCRYPTO_HASH_ALG',
-  'data': ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512', 'ripemd160']}
+  'data': ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512', 'ripemd160', 'sm3']}

 ##
 # @QCryptoCipherAlgorithm:
@@ -94,6 +96,8 @@
 #
 # @twofish-256: Twofish with 256 bit / 32 byte keys
 #
+# @sm4: SM4 with 128 bit / 16 byte keys (since 9.0)
+#
 # Since: 2.6
 ##
 { 'enum': 'QCryptoCipherAlgorithm',
@@ -102,7 +106,8 @@
             'des', '3des', 'cast5-128',
             'serpent-128', 'serpent-192', 'serpent-256',
-            'twofish-128', 'twofish-192', 'twofish-256']}
+            'twofish-128', 'twofish-192', 'twofish-256',
+            'sm4']}

 ##
 # @QCryptoCipherMode:
diff --git a/qapi/migration.json b/qapi/migration.json
index eb2f8835133480e8439716303a1b97a87a1f67c8..37e1d4857e5cef6b44d9eeea54db56daf97e9015 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -625,11 +625,23 @@
 #
 # @zstd: use zstd compression method.
 #
+# @qatzip: use qatzip compression method. (Since 9.2)
+#
+# @qpl: use qpl compression method. The Query Processing Library (qpl)
+#     is based on the deflate compression algorithm and uses the Intel
+#     In-Memory Analytics Accelerator (IAA) for accelerated compression
+#     and decompression. (Since 9.1)
+#
+# @uadk: use UADK library compression method. (Since 9.1)
+#
 # Since: 5.0
 ##
 { 'enum': 'MultiFDCompression',
   'data': [ 'none', 'zlib',
-            { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] }
+            { 'name': 'zstd', 'if': 'CONFIG_ZSTD' },
+            { 'name': 'qatzip', 'if': 'CONFIG_QATZIP'},
+            { 'name': 'qpl', 'if': 'CONFIG_QPL' },
+            { 'name': 'uadk', 'if': 'CONFIG_UADK' } ] }

 ##
 # @MigMode:
@@ -653,6 +665,23 @@
 { 'enum': 'MigMode',
   'data': [ 'normal', 'cpr-reboot' ] }

+##
+# @ZeroPageDetection:
+#
+# @none: Do not perform zero page checking.
+#
+# @legacy: Perform zero page checking in the main migration thread.
+#
+# @multifd: Perform zero page checking in the multifd sender thread if
+#     multifd migration is enabled, else in the main migration
+#     thread as for @legacy.
+#
+# Since: 9.0
+##
+{ 'enum': 'ZeroPageDetection',
+  'data': [ 'none', 'legacy', 'multifd' ] }
+
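What @multifd moves off the main thread is the scan itself: each sender thread can test its queued pages with buffer_is_zero() and transfer payload only for non-zero pages, recording the rest in the packet header. A simplified sketch — the MultiFDSendParams fields used here are illustrative, not the exact layout:

    /* Sketch only: zero-page scan inside a multifd sender thread. */
    static void multifd_scan_zero_pages(MultiFDSendParams *p)
    {
        size_t page_size = qemu_target_page_size();
        uint32_t i;

        for (i = 0; i < p->pages->num; i++) {
            uint8_t *page = p->pages->block->host + p->pages->offset[i];

            if (buffer_is_zero(page, page_size)) {
                /* header-only: no payload bytes for this page */
                p->zero[p->zero_num++] = p->pages->offset[i];
            } else {
                p->normal[p->normal_num++] = p->pages->offset[i];
            }
        }
    }
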
 ##
 # @BitmapMigrationBitmapAliasTransform:
 #
@@ -708,6 +737,20 @@
               'bitmaps': [ 'BitmapMigrationBitmapAlias' ] } }

+##
+# @CompressMethod:
+#
+# An enumeration of multi-thread compression methods.
+#
+# @zlib: use zlib compression method.
+#
+# @zstd: use zstd compression method.
+#
+# Since: 5.0
+##
+{ 'enum': 'CompressMethod',
+  'data': [ 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] }
+
 ##
 # @MigrationParameter:
 #
@@ -746,6 +789,9 @@
 #     fast as compression, so set the decompress-threads to the number
 #     about 1/4 of compress-threads is adequate.
 #
+# @compress-method: Which multi-thread compression method to use.
+#     Defaults to none. (Since 5.0)
+#
 # @throttle-trigger-threshold: The ratio of bytes_dirty_period and
 #     bytes_xfer_period to trigger throttling. It is expressed as
 #     percentage. The default value is 50. (Since 5.0)
@@ -842,6 +888,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU. Defaults to 1. (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1. (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@@ -874,6 +925,24 @@
 # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
 #     (Since 8.2)
 #
+# @zero-page-detection: Whether and how to detect zero pages.
+#     See description in @ZeroPageDetection. Default is 'legacy'.
+#     (since 9.0)
+#
+# @sev-pdh: The target host platform Diffie-Hellman key encoded in base64,
+#     or the PDH filename for Hygon (Since 4.2)
+#
+# @sev-plat-cert: The target host platform certificate chain encoded in
+#     base64, or the platform cert filename for Hygon (Since 4.2)
+#
+# @sev-amd-cert: AMD certificate chain which includes the ASK and OCA,
+#     encoded in base64, or the vendor cert filename for Hygon (Since 4.2)
+#
+# @hdbss-buffer-size: Size of the HDBSS (Hardware Dirty state tracking
+#     Structure). Defaults to 0. (Since 8.6)
+#
 # Features:
 #
 # @deprecated: Member @block-incremental is deprecated. Use
@@ -892,6 +961,7 @@
            { 'name': 'compress-level', 'features': [ 'deprecated' ] },
            { 'name': 'compress-threads', 'features': [ 'deprecated' ] },
            { 'name': 'decompress-threads', 'features': [ 'deprecated' ] },
+           { 'name': 'compress-method', 'features': [ 'deprecated' ] },
            { 'name': 'compress-wait-thread', 'features': [ 'deprecated' ] },
            'throttle-trigger-threshold', 'cpu-throttle-initial',
            'cpu-throttle-increment',
@@ -904,10 +974,13 @@
            'xbzrle-cache-size', 'max-postcopy-bandwidth',
            'max-cpu-throttle', 'multifd-compression',
            'multifd-zlib-level', 'multifd-zstd-level',
+           'multifd-qatzip-level',
            'block-bitmap-mapping',
            { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] },
            'vcpu-dirty-limit',
-           'mode'] }
+           'mode',
+           'zero-page-detection',
+           'sev-pdh', 'sev-plat-cert', 'sev-amd-cert', 'hdbss-buffer-size'] }

 ##
 # @MigrateSetParameters:
@@ -935,6 +1008,9 @@
 #
 # @decompress-threads: decompression thread count
 #
+# @compress-method: Set the compression method to use in multi-thread
+#     compression. Defaults to none. (Since 5.0)
+#
 # @throttle-trigger-threshold: The ratio of bytes_dirty_period and
 #     bytes_xfer_period to trigger throttling. It is expressed as
 #     percentage. The default value is 50. (Since 5.0)
@@ -1030,6 +1106,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU. Defaults to 1. (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1. (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@@ -1062,12 +1143,31 @@
 # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
 #     (Since 8.2)
 #
+# @zero-page-detection: Whether and how to detect zero pages.
+#     See description in @ZeroPageDetection. Default is 'legacy'.
+#     (since 9.0)
+#
+# @sev-pdh: The target host platform Diffie-Hellman key encoded in base64,
+#     or the PDH filename for Hygon (Since 4.2)
+#
+# @sev-plat-cert: The target host platform certificate chain encoded in
+#     base64, or the platform cert filename for Hygon (Since 4.2)
+#
+# @sev-amd-cert: AMD certificate chain which includes the ASK and OCA,
+#     encoded in base64, or the vendor cert filename for Hygon (Since 4.2)
+#
+# @hdbss-buffer-size: Size of the HDBSS (Hardware Dirty state tracking
+#     Structure). Defaults to 0. (Since 8.6)
+#
 # Features:
 #
 # @deprecated: Member @block-incremental is deprecated. Use
 #     blockdev-mirror with NBD instead. Members @compress-level,
-#     @compress-threads, @decompress-threads and @compress-wait-thread
-#     are deprecated because @compression is deprecated.
+#     @compress-threads, @decompress-threads, @compress-method
+#     and @compress-wait-thread are deprecated because
+#     @compression is deprecated.
 #
 # @unstable: Members @x-checkpoint-delay and @x-vcpu-dirty-limit-period
 #     are experimental.
@@ -1090,6 +1190,8 @@
             'features': [ 'deprecated' ] },
             '*decompress-threads': { 'type': 'uint8',
                                      'features': [ 'deprecated' ] },
+            '*compress-method': { 'type': 'CompressMethod',
+                                  'features': [ 'deprecated' ] },
             '*throttle-trigger-threshold': 'uint8',
             '*cpu-throttle-initial': 'uint8',
             '*cpu-throttle-increment': 'uint8',
@@ -1110,12 +1212,18 @@
             '*max-cpu-throttle': 'uint8',
             '*multifd-compression': 'MultiFDCompression',
             '*multifd-zlib-level': 'uint8',
+            '*multifd-qatzip-level': 'uint8',
             '*multifd-zstd-level': 'uint8',
             '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
             '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
                                             'features': [ 'unstable' ] },
             '*vcpu-dirty-limit': 'uint64',
-            '*mode': 'MigMode'} }
+            '*mode': 'MigMode',
+            '*zero-page-detection': 'ZeroPageDetection',
+            '*sev-pdh': 'StrOrNull',
+            '*sev-plat-cert': 'StrOrNull',
+            '*sev-amd-cert' : 'StrOrNull',
+            '*hdbss-buffer-size': 'uint8'} }

 ##
 # @migrate-set-parameters:
@@ -1161,6 +1269,9 @@
 #
 # @decompress-threads: decompression thread count
 #
+# @compress-method: Which multi-thread compression method to use.
+#     Defaults to none. (Since 5.0)
+#
 # @throttle-trigger-threshold: The ratio of bytes_dirty_period and
 #     bytes_xfer_period to trigger throttling. It is expressed as
 #     percentage. The default value is 50. (Since 5.0)
@@ -1258,6 +1369,11 @@
 #     speed, and 9 means best compression ratio which will consume
 #     more CPU. Defaults to 1. (Since 5.0)
 #
+# @multifd-qatzip-level: Set the compression level to be used in live
+#     migration. The level is an integer between 1 and 9, where 1 means
+#     the best compression speed, and 9 means the best compression
+#     ratio which will consume more CPU. Defaults to 1. (Since 9.2)
+#
 # @multifd-zstd-level: Set the compression level to be used in live
 #     migration, the compression level is an integer between 0 and 20,
 #     where 0 means no compression, 1 means the best compression
@@ -1290,6 +1406,24 @@
 # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
 #     (Since 8.2)
 #
+# @zero-page-detection: Whether and how to detect zero pages.
+#     See description in @ZeroPageDetection. Default is 'legacy'.
+#     (since 9.0)
+#
+# @sev-pdh: The target host platform Diffie-Hellman key encoded in base64,
+#     or the PDH filename for Hygon (Since 4.2)
+#
+# @sev-plat-cert: The target host platform certificate chain encoded in
+#     base64, or the platform cert filename for Hygon (Since 4.2)
+#
+# @sev-amd-cert: AMD certificate chain which includes the ASK and OCA,
+#     encoded in base64, or the vendor cert filename for Hygon (Since 4.2)
+#
+# @hdbss-buffer-size: Size of the HDBSS (Hardware Dirty state tracking
+#     Structure). Defaults to 0. (Since 8.6)
+#
 # Features:
 #
 # @deprecated: Member @block-incremental is deprecated. Use
@@ -1315,6 +1449,8 @@
             'features': [ 'deprecated' ] },
             '*decompress-threads': { 'type': 'uint8',
                                      'features': [ 'deprecated' ] },
+            '*compress-method': { 'type': 'CompressMethod',
+                                  'features': [ 'deprecated' ] },
             '*throttle-trigger-threshold': 'uint8',
             '*cpu-throttle-initial': 'uint8',
             '*cpu-throttle-increment': 'uint8',
@@ -1335,12 +1471,18 @@
             '*max-cpu-throttle': 'uint8',
             '*multifd-compression': 'MultiFDCompression',
             '*multifd-zlib-level': 'uint8',
+            '*multifd-qatzip-level': 'uint8',
             '*multifd-zstd-level': 'uint8',
             '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ],
             '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
                                             'features': [ 'unstable' ] },
             '*vcpu-dirty-limit': 'uint64',
-            '*mode': 'MigMode'} }
+            '*mode': 'MigMode',
+            '*zero-page-detection': 'ZeroPageDetection',
+            '*sev-pdh': 'str',
+            '*sev-plat-cert': 'str',
+            '*sev-amd-cert' : 'str',
+            '*hdbss-buffer-size': 'uint8'} }

 ##
 # @query-migrate-parameters:
@@ -1418,6 +1560,30 @@
 { 'event': 'MIGRATION_PASS',
   'data': { 'pass': 'int' } }

+##
+# @MIGRATION_MULTIFD_PID:
+#
+# Emitted when a multifd thread is created
+#
+# @pid: PID of the multifd thread
+#
+# Since: EulerOS Virtual
+##
+{ 'event': 'MIGRATION_MULTIFD_PID',
+  'data': { 'pid': 'int' } }
+
+##
+# @MIGRATION_PID:
+#
+# Emitted when the migration thread is created
+#
+# @pid: PID of the migration thread
+#
+# Since: EulerOS Virtual
+##
+{ 'event': 'MIGRATION_PID',
+  'data': { 'pid': 'int' } }
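
The QAPI generator turns each event definition into a qapi_event_send_*() helper, so emitting these from migration code reduces to a single call once a thread knows its id. A sketch under that assumption (the wrapper function is illustrative):

    /* Sketch only: raising the new event from a migration thread. */
    #include "qapi/qapi-events-migration.h"

    static void migration_announce_thread_pid(void)
    {
        /* lets management software pin or account the worker thread;
         * multifd threads would use qapi_event_send_migration_multifd_pid() */
        qapi_event_send_migration_pid(qemu_get_thread_id());
    }
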

 ##
 # @COLOMessage:
 #
diff --git a/qapi/misc-target.json b/qapi/misc-target.json
index 88291453ba476a26a4b5583f79a553857aa061f5..3df0062f3dc872bd50c0e014b243f6e3fb088322 100644
--- a/qapi/misc-target.json
+++ b/qapi/misc-target.json
@@ -487,3 +487,34 @@
 { 'command': 'xen-event-inject',
   'data': { 'port': 'uint32' },
   'if': 'TARGET_I386' }
+
+##
+# @VirtccaCapability:
+#
+# This struct describes the capability of the VIRTCCA feature.
+#
+# Since: 8.2.0
+##
+{ 'struct': 'VirtccaCapability',
+  'data': { 'enabled': 'bool' },
+  'if': { 'all': ['TARGET_AARCH64' , 'CONFIG_KVM'] }
+}
+
+##
+# @query-virtcca-capabilities:
+#
+# This command is used to get the VIRTCCA capabilities, and is supported
+# on HiSilicon AArch64 platforms only.
+#
+# Returns: a VirtccaCapability object.
+#
+# Since: 8.2.0
+#
+# Example:
+#
+# -> { "execute": "query-virtcca-capabilities" }
+# <- { "return": { "enabled": true } }
+##
+{ 'command': 'query-virtcca-capabilities', 'returns': 'VirtccaCapability',
+  'if': { 'all': ['TARGET_AARCH64' , 'CONFIG_KVM'] }
+}
\ No newline at end of file
diff --git a/qapi/misc.json b/qapi/misc.json
index cda2effa8155a8a6175036f678877ec2c7cdd1bf..1832d5f4608b86bd9d1e9a85fd291a386b6a4d15 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -550,6 +550,15 @@
  'returns': ['CommandLineOptionInfo'],
  'allow-preconfig': true}

+##
+# @query-rtc-date-diff:
+#
+# Get the VM's RTC time offset
+#
+# Since: 2.8
+##
+{ 'command': 'query-rtc-date-diff', 'returns': 'int64' }
+
 ##
 # @RTC_CHANGE:
 #
diff --git a/qapi/pragma.json b/qapi/pragma.json
index 0aa4eeddd3893bee760fb28ab1d7222dfb3c13ed..7a07b44bb1089d02e56ed185af1368a3bb41fb1b 100644
--- a/qapi/pragma.json
+++ b/qapi/pragma.json
@@ -30,7 +30,8 @@
         'qom-get',
         'query-tpm-models',
         'query-tpm-types',
-        'ringbuf-read' ],
+        'ringbuf-read',
+        'query-rtc-date-diff'],
     # Externally visible types whose member names may use uppercase
     'member-name-exceptions': [     # visible in:
         'ACPISlotType',             # query-acpi-ospm-status
diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index 555528b6bbd357fbbc9048647decfa70e36038a2..e33efd3740cbc12bdf9036d5eaa0d011d2defb9f 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -24,6 +24,8 @@
 #include "qapi/qmp/qbool.h"
 #include "qemu/coroutine.h"
 #include "qemu/main-loop.h"
+#include "qemu/log.h"
+#include "qapi/qmp/qstring.h"

 Visitor *qobject_input_visitor_new_qmp(QObject *obj)
 {
@@ -146,6 +148,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
     QObject *id;
     QObject *ret = NULL;
     QDict *rsp = NULL;
+    GString *json;

     dict = qobject_to(QDict, request);
     if (!dict) {
@@ -203,8 +206,22 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
         qobject_ref(args);
     }

+    json = qobject_to_json(QOBJECT(args));
+    if (json) {
+        if ((strcmp(command, "query-block-jobs") != 0)
+            && (strcmp(command, "query-migrate") != 0)
+            && (strcmp(command, "query-blockstats") != 0)
+            && (strcmp(command, "query-balloon") != 0)
+            && (strcmp(command, "set_password") != 0)) {
+            qemu_log("qmp_cmd_name: %s, arguments: %s\n",
+                     command, json->str);
+        }
+        g_string_free(json, true);
+    }
+
     assert(!(oob && qemu_in_coroutine()));
     assert(monitor_cur() == NULL);
     if (!!(cmd->options & QCO_COROUTINE) == qemu_in_coroutine()) {
         monitor_set_cur(qemu_coroutine_self(), cur_mon);
         cmd->fn(args, &ret, &err);
diff --git a/qapi/qom.json b/qapi/qom.json
index c53ef978ff7e6a81f8c926159fe52c0d349696ac..a5336e6b11accaf35668f3a7ea4f499486556603 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -794,6 +794,23 @@
 { 'struct': 'VfioUserServerProperties',
   'data': { 'socket': 'SocketAddress', 'device': 'str' } }

+##
+# @IOMMUFDProperties:
+#
+# Properties for iommufd objects.
+#
+# @fd: file descriptor name previously passed via 'getfd' command,
+#     which represents a pre-opened /dev/iommu. This allows the
+#     iommufd object to be shared across several subsystems
+#     (VFIO, VDPA, ...), and the file descriptor to be shared
+#     with other processes, e.g. DPDK.  (default: QEMU opens
+#     /dev/iommu by itself)
+#
+# Since: 9.0
+##
+{ 'struct': 'IOMMUFDProperties',
+  'data': { '*fd': 'str' } }
+
 ##
 # @RngProperties:
 #
@@ -866,6 +883,14 @@
 #     designated guest firmware page for measured boot with -kernel
 #     (default: false) (since 6.2)
 #
+# @user-id: the user id of the guest owner; only supported on Hygon CPUs
+#     (since 8.2)
+#
+# @secret-header-file: the header file of the guest owner's secret; only
+#     supported on Hygon CPUs (since 8.2)
+#
+# @secret-file: the file containing the guest owner's secret; only
+#     supported on Hygon CPUs (since 8.2)
+#
 # Since: 2.12
 ##
 { 'struct': 'SevGuestProperties',
@@ -876,7 +901,10 @@
             '*handle': 'uint32',
             '*cbitpos': 'uint32',
             'reduced-phys-bits': 'uint32',
-            '*kernel-hashes': 'bool' } }
+            '*kernel-hashes': 'bool',
+            '*user-id': 'str',
+            '*secret-header-file': 'str',
+            '*secret-file': 'str' } }

 ##
 # @ThreadContextProperties:
@@ -899,6 +927,30 @@
   'data': { '*cpu-affinity': ['uint16'],
             '*node-affinity': ['uint16'] } }

+##
+# @TmmGuestMeasurementAlgo:
+#
+# Algorithm to use for CVM measurements
+#
+# Since: FIXME
+##
+{ 'enum': 'TmmGuestMeasurementAlgo',
+  'data': ['default', 'sha256', 'sha512'] }
+
+##
+# @TmmGuestProperties:
+#
+# Properties for tmm-guest objects.
+#
+# @sve-vector-length: SVE vector length (default: 0, SVE disabled)
+#
+# Since: FIXME
+##
+{ 'struct': 'TmmGuestProperties',
+  'data': { '*sve-vector-length': 'uint32',
+            '*num-pmu-counters': 'uint32',
+            '*kae': 'uint32',
+            '*measurement-algo': 'TmmGuestMeasurementAlgo' } }

 ##
 # @ObjectType:
@@ -934,6 +986,7 @@
     'input-barrier',
     { 'name': 'input-linux',
       'if': 'CONFIG_LINUX' },
+    'iommufd',
     'iothread',
     'main-loop',
     { 'name': 'memory-backend-epc',
@@ -962,7 +1015,8 @@
     'tls-creds-x509',
     'tls-cipher-suites',
     { 'name': 'x-remote-object', 'features': [ 'unstable' ] },
-    { 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] }
+    { 'name': 'x-vfio-user-server', 'features': [ 'unstable' ] },
+    'tmm-guest'
   ] }

 ##
@@ -1003,6 +1057,7 @@
     'input-barrier':                'InputBarrierProperties',
     'input-linux':                  { 'type': 'InputLinuxProperties',
                                       'if': 'CONFIG_LINUX' },
+    'iommufd':                      'IOMMUFDProperties',
     'iothread':                     'IothreadProperties',
     'main-loop':                    'MainLoopProperties',
     'memory-backend-epc':           { 'type': 'MemoryBackendEpcProperties',
@@ -1029,7 +1084,8 @@
     'tls-creds-x509':               'TlsCredsX509Properties',
     'tls-cipher-suites':            'TlsCredsProperties',
     'x-remote-object':              'RemoteObjectProperties',
-    'x-vfio-user-server':           'VfioUserServerProperties'
+    'x-vfio-user-server':           'VfioUserServerProperties',
+    'tmm-guest':                    'TmmGuestProperties'
   } }

 ##
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 068692d13ebd944b78ded2c76b5f1b616b351b4d..20bdcd7b8244a1b55ca30fed49dcb83847775f98 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -52,9 +52,9 @@
 SRST
 ERST

 DEF("create", img_create,
-    "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-o options] filename [size]")
+    "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-t cache] [-o options] filename [size]")
 SRST
-.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-o OPTIONS] FILENAME [SIZE]
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 068692d13ebd944b78ded2c76b5f1b616b351b4d..20bdcd7b8244a1b55ca30fed49dcb83847775f98 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -52,9 +52,9 @@ SRST
 ERST
 
 DEF("create", img_create,
-    "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-o options] filename [size]")
+    "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-t cache] [-o options] filename [size]")
 SRST
-.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-o OPTIONS] FILENAME [SIZE]
+.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-t CACHE] [-o OPTIONS] FILENAME [SIZE]
 ERST
 
 DEF("dd", img_dd,
diff --git a/qemu-img.c b/qemu-img.c
index 5a77f67719316a103255189bea89d3702a0e1eda..49d914c9c4f7ca8fe4a0a0a7bdf4282bc3cabdd8 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -508,6 +508,22 @@ static int64_t cvtnum(const char *name, const char *value)
     return cvtnum_full(name, value, 0, INT64_MAX);
 }
 
+static bool is_reg_file(const char *filename)
+{
+    struct stat st;
+
+    /* file does not exist; it will be created later, so it's a regular file */
+    if (access(filename, F_OK) == -1) {
+        return true;
+    }
+
+    /* file exists; check its type */
+    if (stat(filename, &st) >= 0 && S_ISREG(st.st_mode)) {
+        return true;
+    }
+    return false;
+}
+
 static int img_create(int argc, char **argv)
 {
     int c;
@@ -516,6 +532,7 @@ static int img_create(int argc, char **argv)
     const char *base_fmt = NULL;
     const char *filename;
     const char *base_filename = NULL;
+    const char *cache = BDRV_DEFAULT_CACHE;
     char *options = NULL;
     Error *local_err = NULL;
     bool quiet = false;
@@ -527,7 +544,7 @@ static int img_create(int argc, char **argv)
             {"object", required_argument, 0, OPTION_OBJECT},
             {0, 0, 0, 0}
         };
-        c = getopt_long(argc, argv, ":F:b:f:ho:qu",
+        c = getopt_long(argc, argv, ":F:b:f:t:ho:qu",
                         long_options, NULL);
         if (c == -1) {
             break;
@@ -551,6 +568,9 @@ static int img_create(int argc, char **argv)
         case 'f':
            fmt = optarg;
            break;
+        case 't':
+            cache = optarg;
+            break;
         case 'o':
             if (accumulate_options(&options, optarg) < 0) {
                 goto fail;
@@ -594,6 +614,16 @@ static int img_create(int argc, char **argv)
         error_exit("Unexpected argument: %s", argv[optind]);
     }
 
+    if (is_reg_file(filename)) {
+        if (!options) {
+            options = g_strdup_printf(BLOCK_OPT_CACHE"=%s", cache);
+        } else {
+            char *old_options = options;
+            options = g_strdup_printf("%s,"BLOCK_OPT_CACHE"=%s", options, cache);
+            g_free(old_options);
+        }
+    }
+
     bdrv_img_create(filename, fmt, base_filename, base_fmt, options,
                     img_size, flags, quiet, &local_err);
     if (local_err) {
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 186e6468b1abf2fde0d911bececb1e60d9fd266a..8b09cb5e2a8a904113800d58d1755f4f91e4f1b0 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -389,7 +389,9 @@ static void nbd_accept(QIONetListener *listener, QIOChannelSocket *cioc,
     nb_fds++;
     nbd_update_server_watch();
 
-    nbd_client_new(cioc, tlscreds, tlsauthz, nbd_client_closed);
+    /* TODO - expose handshake timeout as command line option */
+    nbd_client_new(cioc, NBD_DEFAULT_HANDSHAKE_MAX_SECS,
+                   tlscreds, tlsauthz, nbd_client_closed, NULL);
 }
 
 static void nbd_update_server_watch(void)
@@ -587,7 +589,8 @@ int main(int argc, char **argv)
     pthread_t client_thread;
     const char *fmt = NULL;
     Error *local_err = NULL;
-    BlockdevDetectZeroesOptions detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF;
+    BlockdevDetectZeroesOptions detect_zeroes =
+        BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF;
     QDict *options = NULL;
     const char *export_name = NULL; /* defaults to "" later for server mode */
     const char *export_description = NULL;
@@ -843,6 +846,10 @@ int main(int argc, char **argv)
     trace_init_file();
     qemu_set_log(LOG_TRACE, &error_fatal);
 
+    if (!seen_aio && (flags & BDRV_O_NOCACHE)) {
+        flags |= BDRV_O_NATIVE_AIO;
+    }
+
     socket_activation = check_socket_activation();
     if (socket_activation == 0) {
         if (!sockpath) {
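A quick sketch of the new ``-t`` option wired up above; per the img_create() hunk, the cache mode is only folded into the creation options for regular files and defaults to BDRV_DEFAULT_CACHE (the mode names follow the usual qemu-img cache modes such as writeback or none):

    # create a qcow2 image, bypassing the host page cache while creating it
    qemu-img create -f qcow2 -t none test.qcow2 10G

diff --git a/qemu-options.hx b/qemu-options.hx
index 42fd09e4de96e962cd5873c49501f6e1dbb5e346..b09d692d5b4b9d51a3af27bef86aefac8550d24c 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -149,14 +149,14 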
@@ SRST platform and configuration dependent. ``interleave-granularity=granularity`` sets the granularity of - interleave. Default 256KiB. Only 256KiB, 512KiB, 1024KiB, 2048KiB - 4096KiB, 8192KiB and 16384KiB granularities supported. + interleave. Default 256 (bytes). Only 256, 512, 1k, 2k, + 4k, 8k and 16k granularities supported. Example: :: - -machine cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=128G,cxl-fmw.0.interleave-granularity=512k + -machine cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=128G,cxl-fmw.0.interleave-granularity=512 ERST DEF("M", HAS_ARG, QEMU_OPTION_M, @@ -2679,7 +2679,7 @@ DEF("smbios", HAS_ARG, QEMU_OPTION_smbios, " specify SMBIOS type 3 fields\n" "-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str]\n" " [,asset=str][,part=str][,max-speed=%d][,current-speed=%d]\n" - " [,processor-id=%d]\n" + " [,processor-family=%d][,processor-id=%d]\n" " specify SMBIOS type 4 fields\n" "-smbios type=8[,external_reference=str][,internal_reference=str][,connector_type=%d][,port_type=%d]\n" " specify SMBIOS type 8 fields\n" @@ -2690,7 +2690,7 @@ DEF("smbios", HAS_ARG, QEMU_OPTION_smbios, " specify SMBIOS type 17 fields\n" "-smbios type=41[,designation=str][,kind=str][,instance=%d][,pcidev=str]\n" " specify SMBIOS type 41 fields\n", - QEMU_ARCH_I386 | QEMU_ARCH_ARM | QEMU_ARCH_LOONGARCH) + QEMU_ARCH_I386 | QEMU_ARCH_ARM | QEMU_ARCH_LOONGARCH | QEMU_ARCH_RISCV) SRST ``-smbios file=binary`` Load SMBIOS entry from binary file. @@ -2707,7 +2707,7 @@ SRST ``-smbios type=3[,manufacturer=str][,version=str][,serial=str][,asset=str][,sku=str]`` Specify SMBIOS type 3 fields -``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-id=%d]`` +``-smbios type=4[,sock_pfx=str][,manufacturer=str][,version=str][,serial=str][,asset=str][,part=str][,processor-family=%d][,processor-id=%d]`` Specify SMBIOS type 4 fields ``-smbios type=11[,value=str][,path=filename]`` @@ -5224,6 +5224,18 @@ SRST The ``share`` boolean option is on by default with memfd. + ``-object iommufd,id=id[,fd=fd]`` + Creates an iommufd backend which allows control of DMA mapping + through the ``/dev/iommu`` device. + + The ``id`` parameter is a unique ID which frontends (such as + vfio-pci of vdpa) will use to connect with the iommufd backend. + + The ``fd`` parameter is an optional pre-opened file descriptor + resulting from ``/dev/iommu`` opening. Usually the iommufd is shared + across all subsystems, bringing the benefit of centralized + reference counting. + ``-object rng-builtin,id=id`` Creates a random number generator backend which obtains entropy from QEMU builtin functions. 
The ``id`` parameter is a unique ID @@ -5637,7 +5649,7 @@ SRST -object secret,id=sec0,keyid=secmaster0,format=base64,\\ data=$SECRET,iv=$(= 0) { ga_unset_frozen(ga_state); + slog("guest-fsthaw called"); execute_fsfreeze_hook(FSFREEZE_HOOK_THAW, errp); } else { ret = 0; diff --git a/qga/commands-win32.c b/qga/commands-win32.c index 697c65507caa06889e4aae01cebe9f87e35eb4b0..656d1459f14858ee8aa04a3bfb6d27261e20a853 100644 --- a/qga/commands-win32.c +++ b/qga/commands-win32.c @@ -1275,6 +1275,9 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) qga_vss_fsfreeze(&i, false, NULL, errp); ga_unset_frozen(ga_state); + + slog("guest-fsthaw called"); + return i; } diff --git a/qga/main.c b/qga/main.c index 8668b9f3d39ea224b61fd6201b0b19e258b6c2c7..8d341ffdf1e1d45016c4f1ccd4534cd2a235382b 100644 --- a/qga/main.c +++ b/qga/main.c @@ -1399,7 +1399,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) if (g_mkdir_with_parents(config->state_dir, S_IRWXU) == -1) { g_critical("unable to create (an ancestor of) the state directory" " '%s': %s", config->state_dir, strerror(errno)); - return NULL; + goto failed; } #endif @@ -1407,7 +1407,6 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) if (config->daemonize) { /* delay opening/locking of pidfile till filesystems are unfrozen */ s->deferred_options.pid_filepath = config->pid_filepath; - become_daemon(NULL); } if (config->log_filepath) { /* delay opening the log file till filesystems are unfrozen */ @@ -1416,15 +1415,12 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) ga_disable_logging(s); qmp_for_each_command(&ga_commands, ga_disable_not_allowed_freeze, NULL); } else { - if (config->daemonize) { - become_daemon(config->pid_filepath); - } if (config->log_filepath) { FILE *log_file = ga_open_logfile(config->log_filepath); if (!log_file) { g_critical("unable to open specified log file: %s", strerror(errno)); - return NULL; + goto failed; } s->log_file = log_file; } @@ -1435,7 +1431,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) s->pstate_filepath, ga_is_frozen(s))) { g_critical("failed to load persistent state"); - return NULL; + goto failed; } if (config->allowedrpcs) { @@ -1465,7 +1461,7 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) #ifndef _WIN32 if (!register_signal_handlers()) { g_critical("failed to register signal handlers"); - return NULL; + goto failed; } #endif @@ -1478,12 +1474,34 @@ static GAState *initialize_agent(GAConfig *config, int socket_activation) s->wakeup_event = CreateEvent(NULL, TRUE, FALSE, TEXT("WakeUp")); if (s->wakeup_event == NULL) { g_critical("CreateEvent failed"); - return NULL; + goto failed; } #endif + if (!channel_init(s, s->config->method, s->config->channel_path, + s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD : -1)) { + g_critical("failed to initialize guest agent channel"); + return NULL; + } + + if (config->daemonize) { + if (ga_is_frozen(s)) { + become_daemon(NULL); + } else { + become_daemon(config->pid_filepath); + } + } + ga_state = s; return s; +failed: + g_free(s->pstate_filepath); + g_free(s->state_filepath_isfrozen); + if (s->log_file) { + fclose(s->log_file); + } + g_free(s); + return NULL; } static void cleanup_agent(GAState *s) @@ -1508,8 +1526,9 @@ static void cleanup_agent(GAState *s) static int run_agent_once(GAState *s) { - if (!channel_init(s, s->config->method, s->config->channel_path, - s->socket_activation ? 
FIRST_SOCKET_ACTIVATION_FD : -1)) {
+    if (!s->channel &&
+        !channel_init(s, s->config->method, s->config->channel_path,
+                      s->socket_activation ? FIRST_SOCKET_ACTIVATION_FD : -1)) {
         g_critical("failed to initialize guest agent channel");
         return EXIT_FAILURE;
     }
@@ -1518,6 +1537,7 @@ static int run_agent_once(GAState *s)
 
     if (s->channel) {
         ga_channel_free(s->channel);
+        s->channel = NULL;
     }
 
     return EXIT_SUCCESS;
diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap
index 3fb4d5b3425c3e2364b7715032adc750a2300e1e..508be19c75816fa27a687d0e1946e62e9f38d5e5 100755
--- a/scripts/kvm/vmxcap
+++ b/scripts/kvm/vmxcap
@@ -24,6 +24,7 @@ MSR_IA32_VMX_TRUE_EXIT_CTLS = 0x48F
 MSR_IA32_VMX_TRUE_ENTRY_CTLS = 0x490
 MSR_IA32_VMX_VMFUNC = 0x491
 MSR_IA32_VMX_PROCBASED_CTLS3 = 0x492
+MSR_IA32_VMX_EXIT_CTLS2 = 0x493
 
 class msr(object):
     def __init__(self):
@@ -116,6 +117,7 @@ controls = [
             54: 'INS/OUTS instruction information',
             55: 'IA32_VMX_TRUE_*_CTLS support',
             56: 'Skip checks on event error code',
+            58: 'VMX nested exception support',
         },
         msr = MSR_IA32_VMX_BASIC,
     ),
@@ -219,11 +221,21 @@ controls = [
             23: 'Clear IA32_BNDCFGS',
             24: 'Conceal VM exits from PT',
             25: 'Clear IA32_RTIT_CTL',
+            31: 'Activate secondary VM-exit controls',
         },
         cap_msr = MSR_IA32_VMX_EXIT_CTLS,
         true_cap_msr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
     ),
 
+    Allowed1Control(
+        name = 'secondary VM-Exit controls',
+        bits = {
+            0: 'Save IA32 FRED MSRs',
+            1: 'Load IA32 FRED MSRs',
+        },
+        cap_msr = MSR_IA32_VMX_EXIT_CTLS2,
+    ),
+
     Control(
         name = 'VM-Entry controls',
         bits = {
@@ -237,6 +249,7 @@ controls = [
             16: 'Load IA32_BNDCFGS',
             17: 'Conceal VM entries from PT',
             18: 'Load IA32_RTIT_CTL',
+            23: 'Load IA32 FRED MSRs',
         },
         cap_msr = MSR_IA32_VMX_ENTRY_CTLS,
         true_cap_msr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 680fa3f581dafcc43d90e2884d5845fc8b9e1017..8604fe8ffa8e668952fc8c761db1da8fd83702e5 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -163,6 +163,7 @@ meson_options_help() {
   printf "%s\n" '  pixman          pixman support'
   printf "%s\n" '  plugins         TCG plugins via shared library loading'
   printf "%s\n" '  png             PNG support with libpng'
+  printf "%s\n" '  qatzip          QATzip compression support'
   printf "%s\n" '  pvrdma          Enable PVRDMA support'
   printf "%s\n" '  qcow1           qcow1 image format support'
   printf "%s\n" '  qed             qed image format support'
@@ -222,6 +223,8 @@ meson_options_help() {
   printf "%s\n" '                  Xen PCI passthrough support'
   printf "%s\n" '  xkbcommon       xkbcommon support'
   printf "%s\n" '  zstd            zstd compression support'
+  printf "%s\n" '  qpl             Query Processing Library support'
+  printf "%s\n" '  uadk            UADK Library support'
 }
 _meson_option_parse() {
   case $1 in
@@ -428,6 +431,8 @@ _meson_option_parse() {
     --enable-png) printf "%s" -Dpng=enabled ;;
     --disable-png) printf "%s" -Dpng=disabled ;;
     --prefix=*) quote_sh "-Dprefix=$2" ;;
+    --enable-qatzip) printf "%s" -Dqatzip=enabled ;;
+    --disable-qatzip) printf "%s" -Dqatzip=disabled ;;
    --enable-pvrdma) printf "%s" -Dpvrdma=enabled ;;
    --disable-pvrdma) printf "%s" -Dpvrdma=disabled ;;
    --enable-qcow1) printf "%s" -Dqcow1=enabled ;;
@@ -562,6 +567,10 @@ _meson_option_parse() {
     --disable-xkbcommon) printf "%s" -Dxkbcommon=disabled ;;
     --enable-zstd) printf "%s" -Dzstd=enabled ;;
     --disable-zstd) printf "%s" -Dzstd=disabled ;;
+    --enable-qpl) printf "%s" -Dqpl=enabled ;;
+    --disable-qpl) printf "%s" -Dqpl=disabled ;;
+    --enable-uadk) printf "%s" -Duadk=enabled ;;
+    --disable-uadk) printf "%s" -Duadk=disabled ;;
     *) return 1 ;;
   esac
 }
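The new build switches registered above are driven through configure in the usual way; a minimal sketch, assuming the QATzip, QPL and UADK development packages are installed on the build host:

    ./configure --enable-qatzip --enable-qpl --enable-uadk
    make -j"$(nproc)"
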
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 34295c0fe55b72bbf4db013c4e96262fa1ebfeac..88c76b8f69b07e9b7b587e27e87578686f99a7a9 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -156,6 +156,10 @@ for arch in $ARCHLIST; do
         cp_portable "$tmpdir/bootparam.h" \
                     "$output/include/standard-headers/asm-$arch"
     fi
+    if [ $arch = loongarch ]; then
+        cp "$hdrdir/include/asm/kvm_para.h" "$output/linux-headers/asm-loongarch/"
+        cp "$hdrdir/include/asm/unistd_64.h" "$output/linux-headers/asm-loongarch/"
+    fi
 done
 
 rm -rf "$output/linux-headers/linux"
diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
index c6c6347e9b6a5554772908bc4d45f077b9907934..655404fd078147ca05db9d038792e67052c9eceb 100644
--- a/scsi/qemu-pr-helper.c
+++ b/scsi/qemu-pr-helper.c
@@ -285,9 +285,12 @@ static void multipath_pr_init(void)
 
 static int is_mpath(int fd)
 {
-    struct dm_ioctl dm = { .flags = DM_NOFLUSH_FLAG };
+    struct dm_ioctl dm;
     struct dm_target_spec *tgt;
 
+    memset(&dm, 0, sizeof(struct dm_ioctl));
+    dm.flags = DM_NOFLUSH_FLAG;
+
     tgt = dm_dev_ioctl(fd, DM_TABLE_STATUS, &dm);
     if (!tgt) {
         if (errno == ENXIO) {
diff --git a/system/cpus.c b/system/cpus.c
index a444a747f0164ee9a998ef4b25b49a2efc4607a8..d9de09b9e87b5f5d4eab095e38f7ca2ed942b2ba 100644
--- a/system/cpus.c
+++ b/system/cpus.c
@@ -193,6 +193,20 @@ void cpu_synchronize_pre_loadvm(CPUState *cpu)
     }
 }
 
+void cpus_control_pre_system_reset(void)
+{
+    if (cpus_accel->control_pre_system_reset) {
+        cpus_accel->control_pre_system_reset();
+    }
+}
+
+void cpus_control_post_system_reset(void)
+{
+    if (cpus_accel->control_post_system_reset) {
+        cpus_accel->control_post_system_reset();
+    }
+}
+
 bool cpus_are_resettable(void)
 {
     if (cpus_accel->cpus_are_resettable) {
@@ -551,12 +565,14 @@ static bool all_vcpus_paused(void)
     return true;
 }
 
-void pause_all_vcpus(void)
+static void request_pause_all_vcpus(void)
 {
     CPUState *cpu;
 
-    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
     CPU_FOREACH(cpu) {
+        if (cpu->stopped) {
+            continue;
+        }
         if (qemu_cpu_is_self(cpu)) {
             qemu_cpu_stop(cpu, true);
         } else {
@@ -564,6 +580,14 @@ void pause_all_vcpus(void)
             qemu_cpu_kick(cpu);
         }
     }
+}
+
+void pause_all_vcpus(void)
+{
+    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
+
+retry:
+    request_pause_all_vcpus();
 
     /* We need to drop the replay_lock so any vCPU threads woken up
      * can finish their replay tasks
@@ -572,14 +596,23 @@ void pause_all_vcpus(void)
 
     while (!all_vcpus_paused()) {
         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
-        CPU_FOREACH(cpu) {
-            qemu_cpu_kick(cpu);
-        }
+        /* While we waited on qemu_pause_cond the BQL was unlocked, so a
+         * vCPU's state may have been changed by another thread; request
+         * the pause state on all vCPUs again.
+         */
+        request_pause_all_vcpus();
     }
 
     qemu_mutex_unlock_iothread();
     replay_mutex_lock();
     qemu_mutex_lock_iothread();
+
+    /* While the BQL was unlocked, a vCPU's state may have been
+     * changed by another thread, so we must retry.
+ */ + if (!all_vcpus_paused()) { + goto retry; + } } void cpu_resume(CPUState *cpu) @@ -599,6 +632,9 @@ void resume_all_vcpus(void) qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true); CPU_FOREACH(cpu) { + if (!object_property_get_bool(OBJECT(cpu), "realized", &error_abort)) { + continue; + } cpu_resume(cpu); } } diff --git a/system/dma-helpers.c b/system/dma-helpers.c index 36211acc7eaeb2a9ac16375daf5f3f59bd64c329..611ea04ffb7459dd485f30f1fff8a5f2f5d27c5c 100644 --- a/system/dma-helpers.c +++ b/system/dma-helpers.c @@ -167,7 +167,7 @@ static void dma_blk_cb(void *opaque, int ret) if (dbs->iov.size == 0) { trace_dma_map_wait(dbs); dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs); - cpu_register_map_client(dbs->bh); + address_space_register_map_client(dbs->sg->as, dbs->bh); goto out; } @@ -197,7 +197,7 @@ static void dma_aio_cancel(BlockAIOCB *acb) } if (dbs->bh) { - cpu_unregister_map_client(dbs->bh); + address_space_unregister_map_client(dbs->sg->as, dbs->bh); qemu_bh_delete(dbs->bh); dbs->bh = NULL; } diff --git a/system/main.c b/system/main.c index 9b91d21ea8c9186f2c80ac604542731c12cf05f1..28bb283ebfd609c1c7590cf360cbe05112b78f1c 100644 --- a/system/main.c +++ b/system/main.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "qemu-main.h" #include "sysemu/sysemu.h" @@ -34,6 +35,7 @@ int qemu_default_main(void) { int status; + qemu_log("qemu enter main_loop\n"); status = qemu_main_loop(); qemu_cleanup(status); diff --git a/system/memory.c b/system/memory.c index 798b6c0a171b71db8acb9aea503ea21ac26a07fe..607ce9cf60c4117cd08e8a1ac6e0a33cfdf63127 100644 --- a/system/memory.c +++ b/system/memory.c @@ -48,6 +48,9 @@ static QTAILQ_HEAD(, MemoryListener) memory_listeners static QTAILQ_HEAD(, AddressSpace) address_spaces = QTAILQ_HEAD_INITIALIZER(address_spaces); +static QTAILQ_HEAD(, SharedRegionListener) shared_region_listeners + = QTAILQ_HEAD_INITIALIZER(shared_region_listeners); + static GHashTable *flat_views; typedef struct AddrRange AddrRange; @@ -853,6 +856,13 @@ static void address_space_update_ioeventfds(AddressSpace *as) return; } + view = address_space_get_flatview(as); + if (view->flags & FLATVIEW_FLAG_LAST_PROCESSED) { + flatview_unref(view); + return; + } + view->flags |= FLATVIEW_FLAG_LAST_PROCESSED; + /* * It is likely that the number of ioeventfds hasn't changed much, so use * the previous size as the starting value, with some headroom to avoid @@ -861,7 +871,6 @@ static void address_space_update_ioeventfds(AddressSpace *as) ioeventfd_max = QEMU_ALIGN_UP(as->ioeventfd_nb, 4); ioeventfds = g_new(MemoryRegionIoeventfd, ioeventfd_max); - view = address_space_get_flatview(as); FOR_EACH_FLAT_RANGE(fr, view) { for (i = 0; i < fr->mr->ioeventfd_nb; ++i) { tmp = addrrange_shift(fr->mr->ioeventfds[i].addr, @@ -1108,40 +1117,64 @@ static void address_space_update_topology(AddressSpace *as) address_space_set_flatview(as); } +static void address_space_update_view(AddressSpace *as) +{ + FlatView *view; + + view = address_space_get_flatview(as); + if (view->flags & FLATVIEW_FLAG_LAST_PROCESSED) { + view->flags &= ~FLATVIEW_FLAG_LAST_PROCESSED; + } + flatview_unref(view); +} + void memory_region_transaction_begin(void) { qemu_flush_coalesced_mmio_buffer(); ++memory_region_transaction_depth; } -void memory_region_transaction_commit(void) +void memory_region_commit(void) { AddressSpace *as; + if (memory_region_update_pending) { + flatviews_reset(); + + MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); + + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + 
address_space_set_flatview(as); + address_space_update_ioeventfds(as); + } + memory_region_update_pending = false; + ioeventfd_update_pending = false; + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_view(as); + } + MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); + } else if (ioeventfd_update_pending) { + MEMORY_LISTENER_CALL_GLOBAL(eventfd_begin, Forward); + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_ioeventfds(as); + } + ioeventfd_update_pending = false; + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + address_space_update_view(as); + } + MEMORY_LISTENER_CALL_GLOBAL(eventfd_end, Forward); + } +} + +void memory_region_transaction_commit(void) +{ assert(memory_region_transaction_depth); assert(qemu_mutex_iothread_locked()); --memory_region_transaction_depth; if (!memory_region_transaction_depth) { - if (memory_region_update_pending) { - flatviews_reset(); - - MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); - - QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - address_space_set_flatview(as); - address_space_update_ioeventfds(as); - } - memory_region_update_pending = false; - ioeventfd_update_pending = false; - MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); - } else if (ioeventfd_update_pending) { - QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - address_space_update_ioeventfds(as); - } - ioeventfd_update_pending = false; - } - } + memory_region_commit(); + } } static void memory_region_destructor_none(MemoryRegion *mr) @@ -2226,6 +2259,21 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, return true; } +void shared_region_register_listener(SharedRegionListener *shl) +{ + QTAILQ_INSERT_TAIL(&shared_region_listeners, shl, next); +} + +void shared_region_unregister_listener(SharedRegionListener *shl) +{ + QTAILQ_REMOVE(&shared_region_listeners, shl, next); +} + +void *shared_region_listeners_get(void) +{ + return &shared_region_listeners; +} + void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) { uint8_t mask = 1 << client; @@ -2253,6 +2301,12 @@ void memory_region_set_dirty(MemoryRegion *mr, hwaddr addr, memory_region_get_dirty_log_mask(mr)); } +bool is_first_section(MemoryRegionSection *section) +{ + return section->fv->ranges->addr.start == section->offset_within_address_space && + section->fv->ranges->addr.size == section->size; +} + /* * If memory region `mr' is NULL, do global sync. Otherwise, sync * dirty bitmap for the specified memory region. @@ -3117,19 +3171,31 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) as->ioeventfds = NULL; QTAILQ_INIT(&as->listeners); QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); + as->max_bounce_buffer_size = DEFAULT_MAX_BOUNCE_BUFFER_SIZE; + as->bounce_buffer_size = 0; + qemu_mutex_init(&as->map_client_list_lock); + QLIST_INIT(&as->map_client_list); as->name = g_strdup(name ? 
name : "anonymous"); address_space_update_topology(as); address_space_update_ioeventfds(as); + address_space_update_view(as); } static void do_address_space_destroy(AddressSpace *as) { + assert(qatomic_read(&as->bounce_buffer_size) == 0); + assert(QLIST_EMPTY(&as->map_client_list)); + qemu_mutex_destroy(&as->map_client_list_lock); + assert(QTAILQ_EMPTY(&as->listeners)); flatview_unref(as->current_map); g_free(as->name); g_free(as->ioeventfds); memory_region_unref(as->root); + if (as->free_in_rcu) { + g_free(as); + } } void address_space_destroy(AddressSpace *as) diff --git a/system/physmem.c b/system/physmem.c index a63853a7bc9d5c4fc2ee06f5fe56222c36490fe0..8f4be2d1310c6cb007ae7b4db6ff135d475cf6de 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -89,9 +89,17 @@ RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) }; static MemoryRegion *system_memory; static MemoryRegion *system_io; +static MemoryRegion *virtcca_shared_memory; + +/* + * Serves as the sub-MR of the root MR (virtcca_shared_memory) + * and is associated with the RAMBlock. + */ +MemoryRegion *virtcca_shared_hugepage; AddressSpace address_space_io; AddressSpace address_space_memory; +AddressSpace address_space_virtcca_shared_memory; static MemoryRegion io_mem_unassigned; @@ -761,6 +769,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, if (!cpu->cpu_ases) { cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); + cpu->cpu_ases_count = cpu->num_ases; } newas = &cpu->cpu_ases[asidx]; @@ -774,6 +783,34 @@ void cpu_address_space_init(CPUState *cpu, int asidx, } } +void cpu_address_space_destroy(CPUState *cpu, int asidx) +{ + CPUAddressSpace *cpuas; + + assert(cpu->cpu_ases); + assert(asidx >= 0 && asidx < cpu->num_ases); + /* KVM cannot currently support multiple address spaces. 
*/
+    assert(asidx == 0 || !kvm_enabled());
+
+    cpuas = &cpu->cpu_ases[asidx];
+    if (tcg_enabled()) {
+        memory_listener_unregister(&cpuas->tcg_as_listener);
+    }
+
+    address_space_destroy(cpuas->as);
+    g_free_rcu(cpuas->as, rcu);
+
+    if (asidx == 0) {
+        /* reset the convenience alias for address space 0 */
+        cpu->as = NULL;
+    }
+
+    if (--cpu->cpu_ases_count == 0) {
+        g_free(cpu->cpu_ases);
+        cpu->cpu_ases = NULL;
+    }
+}
+
 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 {
     /* Return the AddressSpace corresponding to the specified index */
@@ -1329,7 +1366,14 @@ static int file_ram_open(const char *path,
             /* @path names a file that doesn't exist, create it */
             fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
             if (fd >= 0) {
-                *created = true;
+                info_report("opened %s successfully", path);
+                /* If the file lives on hugetlbfs, unlink it right away */
+                /* so no file is left behind in case QEMU is killed. */
+                if (qemu_fd_getfiletype(fd) == HUGETLBFS_MAGIC) {
+                    unlink(path);
+                } else {
+                    *created = true;
+                }
                 break;
             }
         } else if (errno == EISDIR) {
@@ -1499,18 +1543,6 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
     return offset;
 }
 
-static unsigned long last_ram_page(void)
-{
-    RAMBlock *block;
-    ram_addr_t last = 0;
-
-    RCU_READ_LOCK_GUARD();
-    RAMBLOCK_FOREACH(block) {
-        last = MAX(last, block->offset + block->max_length);
-    }
-    return last >> TARGET_PAGE_BITS;
-}
-
 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
 {
     int ret;
@@ -1763,13 +1795,11 @@ void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
 }
 
 /* Called with ram_list.mutex held */
-static void dirty_memory_extend(ram_addr_t old_ram_size,
-                                ram_addr_t new_ram_size)
+static void dirty_memory_extend(ram_addr_t new_ram_size)
 {
-    ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
-                                             DIRTY_MEMORY_BLOCK_SIZE);
-    ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
-                                             DIRTY_MEMORY_BLOCK_SIZE);
+    unsigned int old_num_blocks = ram_list.num_dirty_blocks;
+    unsigned int new_num_blocks = DIV_ROUND_UP(new_ram_size,
+                                               DIRTY_MEMORY_BLOCK_SIZE);
     int i;
 
     /* Only need to extend if block count increased */
@@ -1801,6 +1831,8 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
             g_free_rcu(old_blocks, rcu);
         }
     }
+
+    ram_list.num_dirty_blocks = new_num_blocks;
 }
 
 static void ram_block_add(RAMBlock *new_block, Error **errp)
@@ -1809,11 +1841,9 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
     const bool shared = qemu_ram_is_shared(new_block);
     RAMBlock *block;
     RAMBlock *last_block = NULL;
-    ram_addr_t old_ram_size, new_ram_size;
+    ram_addr_t ram_size;
     Error *err = NULL;
 
-    old_ram_size = last_ram_page();
-
     qemu_mutex_lock_ramlist();
 
     new_block->offset = find_ram_offset(new_block->max_length);
@@ -1841,11 +1871,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
         }
     }
 
-    new_ram_size = MAX(old_ram_size,
-                       (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
-    if (new_ram_size > old_ram_size) {
-        dirty_memory_extend(old_ram_size, new_ram_size);
-    }
+    ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS;
+    dirty_memory_extend(ram_size);
     /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
      * QLIST (which has an RCU-friendly variant) does not have insertion at
      * tail, so save the last element in last_block.
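The file_ram_open() change above unlinks an explicitly named hugetlbfs backing file right after creating it, so a killed QEMU cannot leave a stale file behind. A typical affected invocation looks like this (path and sizes are illustrative):

    qemu-system-x86_64 -m 2G \
        -object memory-backend-file,id=ram0,size=2G,mem-path=/dev/hugepages/vm0.ram,share=on \
        -machine memory-backend=ram0

Note that because the file is unlinked as soon as it is created, another process can no longer attach to it by path.
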
@@ -2231,6 +2258,10 @@ RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, ram_addr_t ram_addr; RCU_READ_LOCK_GUARD(); ram_addr = xen_ram_addr_from_mapcache(ptr); + if (ram_addr == RAM_ADDR_INVALID) { + return NULL; + } + block = qemu_get_ram_block(ram_addr); if (block) { *offset = ram_addr - block->offset; @@ -2563,6 +2594,15 @@ static void memory_map_init(void) address_space_init(&address_space_io, system_io, "I/O"); } +void virtcca_shared_memory_address_space_init(void) +{ + virtcca_shared_memory = g_malloc(sizeof(*virtcca_shared_memory)); + memory_region_init(virtcca_shared_memory, NULL, + "virtcca_shared_memory", UINT64_MAX); + address_space_init(&address_space_virtcca_shared_memory, + virtcca_shared_memory, "virtcca_shared_memory"); +} + MemoryRegion *get_system_memory(void) { return system_memory; @@ -2595,6 +2635,17 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask); } +int64_t memory_section_set_dirty_bytemap(MemoryRegionSection *section, unsigned long *bytemap) +{ + ram_addr_t start = section->offset_within_region + + memory_region_get_ram_addr(section->mr); + ram_addr_t pages = int128_get64(section->size) >> TARGET_PAGE_BITS; + + hwaddr idx = BYTE_WORD( + section->offset_within_address_space >> TARGET_PAGE_BITS); + return cpu_physical_memory_set_dirty_bytemap(bytemap + idx, start, pages); +} + void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size) { /* @@ -2974,55 +3025,51 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len) NULL, len, FLUSH_CACHE); } +/* + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used + * to detect illegal pointers passed to address_space_unmap. + */ +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed + typedef struct { + uint64_t magic; MemoryRegion *mr; - void *buffer; hwaddr addr; - hwaddr len; - bool in_use; + size_t len; + uint8_t buffer[]; } BounceBuffer; -static BounceBuffer bounce; - -typedef struct MapClient { - QEMUBH *bh; - QLIST_ENTRY(MapClient) link; -} MapClient; - -QemuMutex map_client_list_lock; -static QLIST_HEAD(, MapClient) map_client_list - = QLIST_HEAD_INITIALIZER(map_client_list); - -static void cpu_unregister_map_client_do(MapClient *client) +static void +address_space_unregister_map_client_do(AddressSpaceMapClient *client) { QLIST_REMOVE(client, link); g_free(client); } -static void cpu_notify_map_clients_locked(void) +static void address_space_notify_map_clients_locked(AddressSpace *as) { - MapClient *client; + AddressSpaceMapClient *client; - while (!QLIST_EMPTY(&map_client_list)) { - client = QLIST_FIRST(&map_client_list); + while (!QLIST_EMPTY(&as->map_client_list)) { + client = QLIST_FIRST(&as->map_client_list); qemu_bh_schedule(client->bh); - cpu_unregister_map_client_do(client); + address_space_unregister_map_client_do(client); } } -void cpu_register_map_client(QEMUBH *bh) +void address_space_register_map_client(AddressSpace *as, QEMUBH *bh) { - MapClient *client = g_malloc(sizeof(*client)); + AddressSpaceMapClient *client = g_malloc(sizeof(*client)); - qemu_mutex_lock(&map_client_list_lock); + qemu_mutex_lock(&as->map_client_list_lock); client->bh = bh; - QLIST_INSERT_HEAD(&map_client_list, client, link); - /* Write map_client_list before reading in_use. */ + QLIST_INSERT_HEAD(&as->map_client_list, client, link); + /* Write map_client_list before reading bounce_buffer_size. 
*/ smp_mb(); - if (!qatomic_read(&bounce.in_use)) { - cpu_notify_map_clients_locked(); + if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) { + address_space_notify_map_clients_locked(as); } - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_unlock(&as->map_client_list_lock); } void cpu_exec_init_all(void) @@ -3038,28 +3085,27 @@ void cpu_exec_init_all(void) finalize_target_page_bits(); io_mem_init(); memory_map_init(); - qemu_mutex_init(&map_client_list_lock); } -void cpu_unregister_map_client(QEMUBH *bh) +void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh) { - MapClient *client; + AddressSpaceMapClient *client; - qemu_mutex_lock(&map_client_list_lock); - QLIST_FOREACH(client, &map_client_list, link) { + qemu_mutex_lock(&as->map_client_list_lock); + QLIST_FOREACH(client, &as->map_client_list, link) { if (client->bh == bh) { - cpu_unregister_map_client_do(client); + address_space_unregister_map_client_do(client); break; } } - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_unlock(&as->map_client_list_lock); } -static void cpu_notify_map_clients(void) +static void address_space_notify_map_clients(AddressSpace *as) { - qemu_mutex_lock(&map_client_list_lock); - cpu_notify_map_clients_locked(); - qemu_mutex_unlock(&map_client_list_lock); + qemu_mutex_lock(&as->map_client_list_lock); + address_space_notify_map_clients_locked(as); + qemu_mutex_unlock(&as->map_client_list_lock); } static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, @@ -3126,8 +3172,8 @@ flatview_extend_translation(FlatView *fv, hwaddr addr, * May map a subset of the requested range, given by and returned in *plen. * May return NULL if resources needed to perform the mapping are exhausted. * Use only for reads OR writes - not for read-modify-write operations. - * Use cpu_register_map_client() to know when retrying the map operation is - * likely to succeed. + * Use address_space_register_map_client() to know when retrying the map + * operation is likely to succeed. 
*/ void *address_space_map(AddressSpace *as, hwaddr addr, @@ -3150,28 +3196,40 @@ void *address_space_map(AddressSpace *as, mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); if (!memory_access_is_direct(mr, is_write)) { - if (qatomic_xchg(&bounce.in_use, true)) { + size_t used = qatomic_read(&as->bounce_buffer_size); + for (;;) { + hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l); + size_t new_size = used + alloc; + size_t actual = + qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size); + if (actual == used) { + l = alloc; + break; + } + used = actual; + } + + if (l == 0) { *plen = 0; return NULL; } - /* Avoid unbounded allocations */ - l = MIN(l, TARGET_PAGE_SIZE); - bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); - bounce.addr = addr; - bounce.len = l; + BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer)); + bounce->magic = BOUNCE_BUFFER_MAGIC; memory_region_ref(mr); - bounce.mr = mr; + bounce->mr = mr; + bounce->addr = addr; + bounce->len = l; + if (!is_write) { flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, - bounce.buffer, l); + bounce->buffer, l); } *plen = l; - return bounce.buffer; + return bounce->buffer; } - memory_region_ref(mr); *plen = flatview_extend_translation(fv, addr, len, mr, xlat, l, is_write, attrs); @@ -3186,12 +3244,11 @@ void *address_space_map(AddressSpace *as, void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, bool is_write, hwaddr access_len) { - if (buffer != bounce.buffer) { - MemoryRegion *mr; - ram_addr_t addr1; + MemoryRegion *mr; + ram_addr_t addr1; - mr = memory_region_from_host(buffer, &addr1); - assert(mr != NULL); + mr = memory_region_from_host(buffer, &addr1); + if (mr != NULL) { if (is_write) { invalidate_and_set_dirty(mr, addr1, access_len); } @@ -3201,16 +3258,23 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, memory_region_unref(mr); return; } + + + BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer); + assert(bounce->magic == BOUNCE_BUFFER_MAGIC); + if (is_write) { - address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED, - bounce.buffer, access_len); - } - qemu_vfree(bounce.buffer); - bounce.buffer = NULL; - memory_region_unref(bounce.mr); - /* Clear in_use before reading map_client_list. */ - qatomic_set_mb(&bounce.in_use, false); - cpu_notify_map_clients(); + address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED, + bounce->buffer, access_len); + } + + qatomic_sub(&as->bounce_buffer_size, bounce->len); + bounce->magic = ~BOUNCE_BUFFER_MAGIC; + memory_region_unref(bounce->mr); + g_free(bounce); + /* Write bounce_buffer_size before reading map_client_list. 
*/ + smp_mb(); + address_space_notify_map_clients(as); } void *cpu_physical_memory_map(hwaddr addr, diff --git a/system/qdev-monitor.c b/system/qdev-monitor.c index a13db763e5dd0fbdc3690ef1bc3bb1bbfffade33..5b35704b5ea54efb2ef25f04249c7b49cec4852f 100644 --- a/system/qdev-monitor.c +++ b/system/qdev-monitor.c @@ -36,6 +36,7 @@ #include "qemu/option.h" #include "qemu/qemu-print.h" #include "qemu/option_int.h" +#include "qemu/log.h" #include "sysemu/block-backend.h" #include "migration/misc.h" #include "migration/migration.h" @@ -643,6 +644,7 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, if (path != NULL) { bus = qbus_find(path, errp); if (!bus) { + qemu_log("can not find bus for %s\n", driver); return NULL; } if (!object_dynamic_cast(OBJECT(bus), dc->bus_type)) { @@ -713,9 +715,11 @@ DeviceState *qdev_device_add_from_qdict(const QDict *opts, object_set_properties_from_keyval(&dev->parent_obj, dev->opts, from_json, errp); if (*errp) { + qemu_log("the bus %s -driver %s set property failed\n", + bus ? bus->name : "None", driver); goto err_del_dev; } - + qemu_log("add qdev %s:%s success\n", driver, dev->id ? dev->id : "none"); if (!qdev_realize(dev, bus, errp)) { goto err_del_dev; } @@ -737,6 +741,8 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) ret = qdev_device_add_from_qdict(qdict, false, errp); if (ret) { + qemu_log("add qdev %s:%s success\n", qemu_opt_get(opts, "driver"), + qemu_opts_id(opts) ? qemu_opts_id(opts) : "none"); qemu_opts_del(opts); } qobject_unref(qdict); diff --git a/system/rtc.c b/system/rtc.c index 4904581abeb8accb97fd5d9b20c337c8dbec0b48..e16b5fffc59c8036f08d1c950f5365e703f270de 100644 --- a/system/rtc.c +++ b/system/rtc.c @@ -44,6 +44,7 @@ static time_t rtc_ref_start_datetime; static int rtc_realtime_clock_offset; /* used only with QEMU_CLOCK_REALTIME */ static int rtc_host_datetime_offset = -1; /* valid & used only with RTC_BASE_DATETIME */ +static time_t rtc_date_diff = 0; QEMUClockType rtc_clock; /***********************************************************/ /* RTC reference time/date access */ @@ -108,6 +109,16 @@ time_t qemu_timedate_diff(struct tm *tm) return seconds - qemu_ref_timedate(QEMU_CLOCK_HOST); } +time_t get_rtc_date_diff(void) +{ + return rtc_date_diff; +} + +void set_rtc_date_diff(time_t diff) +{ + rtc_date_diff = diff; +} + static void configure_rtc_base_datetime(const char *startdate) { time_t rtc_start_datetime; diff --git a/system/runstate.c b/system/runstate.c index ea9d6c2a32a45541a87cecaba569583f60624ea2..7e41626bb15b54b4a86fe4d50ac5388aa168544c 100644 --- a/system/runstate.c +++ b/system/runstate.c @@ -116,6 +116,7 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_PRELAUNCH, RUN_STATE_RUNNING }, { RUN_STATE_PRELAUNCH, RUN_STATE_FINISH_MIGRATE }, { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE }, + { RUN_STATE_PRELAUNCH, RUN_STATE_POSTMIGRATE }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PAUSED }, @@ -486,6 +487,8 @@ void qemu_system_reset(ShutdownCause reason) mc = current_machine ? 
MACHINE_GET_CLASS(current_machine) : NULL;
 
+    cpus_control_pre_system_reset();
+
     cpu_synchronize_all_states();
 
     if (mc && mc->reset) {
@@ -502,6 +505,10 @@ void qemu_system_reset(ShutdownCause reason)
         qapi_event_send_reset(shutdown_caused_by_guest(reason), reason);
     }
     cpu_synchronize_all_post_reset();
+
+    cpus_control_post_system_reset();
+
+    monitor_qapi_event_discard_io_error();
 }
 
 /*
@@ -767,9 +774,11 @@ static bool main_loop_should_exit(int *status)
     }
     if (qemu_powerdown_requested()) {
         qemu_system_powerdown();
+        qemu_log("domain is powered down by outside operation\n");
     }
     if (qemu_vmstop_requested(&r)) {
         vm_stop(r);
+        qemu_log("domain is stopped by outside operation\n");
     }
     return false;
 }
diff --git a/system/vl.c b/system/vl.c
index 2bcd9efb9a64396520576ae5ceaf0523c991a4ca..7c10cd13372302f5069279b7669e30e9c857eee7 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -26,6 +26,7 @@
 #include "qemu/help-texts.h"
 #include "qemu/datadir.h"
 #include "qemu/units.h"
+#include "qemu/log.h"
 #include "exec/cpu-common.h"
 #include "exec/page-vary.h"
 #include "hw/qdev-properties.h"
@@ -172,6 +173,8 @@ static QemuPluginList plugin_list = QTAILQ_HEAD_INITIALIZER(plugin_list);
 static BlockdevOptionsQueue bdo_queue = QSIMPLEQ_HEAD_INITIALIZER(bdo_queue);
 static bool nographic = false;
 static int mem_prealloc; /* force preallocation of physical target memory */
+static ram_addr_t ram2_base;
+static ram_addr_t ram2_size;
 static const char *vga_model = NULL;
 static DisplayOptions dpy;
 static int num_serial_hds;
@@ -503,6 +506,23 @@ static QemuOptsList qemu_action_opts = {
     },
 };
 
+static QemuOptsList qemu_mem2_opts = {
+    .name = "mem2",
+    .merge_lists = true,
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_mem2_opts.head),
+    .desc = {
+        {
+            .name = "base",
+            .type = QEMU_OPT_SIZE,
+        },
+        {
+            .name = "size",
+            .type = QEMU_OPT_SIZE,
+        },
+        { /* end of list */ }
+    },
+};
+
 const char *qemu_get_vm_name(void)
 {
     return qemu_name;
@@ -993,9 +1013,16 @@ static bool vga_interface_available(VGAInterfaceType t)
     const VGAInterfaceInfo *ti = &vga_interfaces[t];
 
     assert(t < VGA_TYPE_MAX);
-    return !ti->class_names[0] ||
-           module_object_class_by_name(ti->class_names[0]) ||
-           module_object_class_by_name(ti->class_names[1]);
+
+    if (!ti->class_names[0] || module_object_class_by_name(ti->class_names[0])) {
+        return true;
+    }
+
+    if (ti->class_names[1] && module_object_class_by_name(ti->class_names[1])) {
+        return true;
+    }
+
+    return false;
 }
 
 static const char *
@@ -1924,6 +1951,9 @@ static void qemu_apply_machine_options(QDict *qdict)
 {
     object_set_properties_from_keyval(OBJECT(current_machine), qdict,
                                       false, &error_fatal);
+    current_machine->ram2_size = ram2_size;
+    current_machine->ram2_base = ram2_base;
+
     if (semihosting_enabled(false) && !semihosting_get_argc()) {
         /* fall back to the -kernel/-append */
         semihosting_arg_fallback(current_machine->kernel_filename, current_machine->kernel_cmdline);
@@ -2086,11 +2116,57 @@ static void parse_memory_options(void)
     loc_pop(&loc);
 }
 
+static void set_mem2_options(void)
+{
+    uint64_t sz, base;
+    const char *mem_str;
+    QemuOpts *opts = qemu_find_opts_singleton("mem2");
+    Location loc;
+
+    loc_push_none(&loc);
+    qemu_opts_loc_restore(opts);
+
+    mem_str = qemu_opt_get(opts, "base");
+    if (mem_str) {
+        if (!*mem_str) {
+            error_report("missing 'base' option value");
+            exit(EXIT_FAILURE);
+        }
+
+        base = qemu_opt_get_size(opts, "base", ram2_base);
+        ram2_base = base;
+    }
+
+    mem_str = qemu_opt_get(opts, "size");
+    if (mem_str) {
+        if (!*mem_str) {
+            error_report("missing 'size' option value");
+            
exit(EXIT_FAILURE); + } + + sz = qemu_opt_get_size(opts, "size", ram2_size); + ram2_size = sz; + } + + if (ram2_base && !ram2_size){ + error_report("missing 'size' option value"); + exit(EXIT_FAILURE); + } + if (!ram2_base && ram2_size){ + error_report("missing 'base' option value"); + exit(EXIT_FAILURE); + } + + loc_pop(&loc); +} + static void qemu_create_machine(QDict *qdict) { MachineClass *machine_class = select_machine(qdict, &error_fatal); object_set_machine_compat_props(machine_class->compat_props); + set_mem2_options(); + current_machine = MACHINE(object_new_with_class(OBJECT_CLASS(machine_class))); object_property_add_child(object_get_root(), "machine", OBJECT(current_machine)); @@ -2633,6 +2709,7 @@ static void qemu_create_cli_devices(void) } /* init generic devices */ + qemu_log("device init start\n"); rom_set_order_override(FW_CFG_ORDER_OVERRIDE_DEVICE); qemu_opts_foreach(qemu_find_opts("device"), device_init_func, NULL, &error_fatal); @@ -2768,6 +2845,7 @@ void qemu_init(int argc, char **argv) qemu_add_opts(&qemu_semihosting_config_opts); qemu_add_opts(&qemu_fw_cfg_opts); qemu_add_opts(&qemu_action_opts); + qemu_add_opts(&qemu_mem2_opts); qemu_add_run_with_opts(); module_call_init(MODULE_INIT_OPTS); @@ -2778,6 +2856,7 @@ void qemu_init(int argc, char **argv) qemu_init_subsystems(); + qemu_log("qemu pid is %d, options parsing start\n", getpid()); /* first pass of option parsing */ optind = 1; while (optind < argc) { @@ -2997,6 +3076,7 @@ void qemu_init(int argc, char **argv) exit(0); break; case QEMU_OPTION_m: + qemu_log("memory options parse start\n"); opts = qemu_opts_parse_noisily(qemu_find_opts("memory"), optarg, true); if (opts == NULL) { exit(1); @@ -3585,6 +3665,13 @@ void qemu_init(int argc, char **argv) case QEMU_OPTION_nouserconfig: /* Nothing to be parsed here. Especially, do not error out below. */ break; + case QEMU_OPTION_mem2: + opts = qemu_opts_parse_noisily(qemu_find_opts("mem2"), + optarg, false); + if (!opts) { + exit(EXIT_FAILURE); + } + break; #if defined(CONFIG_POSIX) case QEMU_OPTION_runas: if (!os_set_runas(optarg)) { @@ -3697,6 +3784,15 @@ void qemu_init(int argc, char **argv) configure_accelerators(argv[0]); phase_advance(PHASE_ACCEL_CREATED); + /* + * Must run after kvm_init completes, as virtcca_cvm_enabled() + * depends on initialization performed in kvm_init. + */ + if (virtcca_cvm_enabled()) { + virtcca_cvm_ram_size = current_machine->ram_size; + virtcca_shared_memory_address_space_init(); + } + /* * Beware, QOM objects created before this point miss global and * compat properties. 
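The mem2 option block parsed by set_mem2_options() above is exposed on the command line through QEMU_OPTION_mem2; a hypothetical invocation follows (the base/size values are illustrative, and per the checks above both suboptions must be supplied together):

    qemu-system-aarch64 -m 4G -mem2 base=0x2000000000,size=2G ...
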
@@ -3714,6 +3810,7 @@ void qemu_init(int argc, char **argv) */ machine_class = MACHINE_GET_CLASS(current_machine); + qemu_log("configure accelerator %s start\n", machine_class->name); if (!qtest_enabled() && machine_class->deprecation_reason) { warn_report("Machine type '%s' is deprecated: %s", machine_class->name, machine_class->deprecation_reason); @@ -3732,6 +3829,7 @@ void qemu_init(int argc, char **argv) */ migration_object_init(); + qemu_log("machine init start\n"); /* parse features once if machine provides default cpu_type */ current_machine->cpu_type = machine_class->default_cpu_type; if (cpu_option) { diff --git a/target/arm/arm-powerctl.c b/target/arm/arm-powerctl.c index c078849403c7751e303a620d15f63da549f75919..fb19b04189293722795432903fe7da4dbff0cf2a 100644 --- a/target/arm/arm-powerctl.c +++ b/target/arm/arm-powerctl.c @@ -16,6 +16,7 @@ #include "qemu/log.h" #include "qemu/main-loop.h" #include "sysemu/tcg.h" +#include "hw/boards.h" #ifndef DEBUG_ARM_POWERCTL #define DEBUG_ARM_POWERCTL 0 @@ -28,18 +29,37 @@ } \ } while (0) +static CPUArchId *arm_get_archid_by_id(uint64_t id) +{ + int n; + CPUArchId *arch_id; + MachineState *ms = MACHINE(qdev_get_machine()); + + /* + * At this point disabled CPUs don't have a CPUState, but their CPUArchId + * exists. + * + * TODO: Is arch_id == mp_affinity? This needs work. + */ + for (n = 0; n < ms->possible_cpus->len; n++) { + arch_id = &ms->possible_cpus->cpus[n]; + + if (arch_id->arch_id == id) { + return arch_id; + } + } + return NULL; +} + CPUState *arm_get_cpu_by_id(uint64_t id) { - CPUState *cpu; + CPUArchId *arch_id; DPRINTF("cpu %" PRId64 "\n", id); - CPU_FOREACH(cpu) { - ARMCPU *armcpu = ARM_CPU(cpu); - - if (armcpu->mp_affinity == id) { - return cpu; - } + arch_id = arm_get_archid_by_id(id); + if (arch_id && arch_id->cpu) { + return CPU(arch_id->cpu); } qemu_log_mask(LOG_GUEST_ERROR, @@ -97,6 +117,7 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, { CPUState *target_cpu_state; ARMCPU *target_cpu; + CPUArchId *arch_id; struct CpuOnInfo *info; assert(qemu_mutex_iothread_locked()); @@ -117,12 +138,24 @@ int arm_set_cpu_on(uint64_t cpuid, uint64_t entry, uint64_t context_id, } /* Retrieve the cpu we are powering up */ - target_cpu_state = arm_get_cpu_by_id(cpuid); - if (!target_cpu_state) { + arch_id = arm_get_archid_by_id(cpuid); + if (!arch_id) { /* The cpu was not found */ return QEMU_ARM_POWERCTL_INVALID_PARAM; } + target_cpu_state = CPU(arch_id->cpu); + if (!qemu_enabled_cpu(target_cpu_state)) { + /* + * The cpu is not plugged in or disabled. 
We should return appropriate + * value as introduced in DEN0022E PSCI 1.2 issue E + */ + qemu_log_mask(LOG_GUEST_ERROR, + "[ARM]%s: Denying attempt to online removed/disabled " + "CPU%" PRId64"\n", __func__, cpuid); + return QEMU_ARM_POWERCTL_IS_OFF; + } + target_cpu = ARM_CPU(target_cpu_state); if (target_cpu->power_state == PSCI_ON) { qemu_log_mask(LOG_GUEST_ERROR, diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h index 954d3582685c44eb156809f013a520603d721d08..165a497f7b9151562a143152f7f0dbd822e25a26 100644 --- a/target/arm/cpu-features.h +++ b/target/arm/cpu-features.h @@ -771,7 +771,7 @@ static inline bool isar_feature_aa64_hcx(const ARMISARegisters *id) static inline bool isar_feature_aa64_tidcp1(const ARMISARegisters *id) { - return FIELD_EX64(id->id_aa64mmfr2, ID_AA64MMFR1, TIDCP1) != 0; + return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, TIDCP1) != 0; } static inline bool isar_feature_aa64_hafs(const ARMISARegisters *id) diff --git a/target/arm/cpu.c b/target/arm/cpu.c index efb22a87f9eda26e212b6a7895d35d67461c6c08..09d391bd348aaf9c7e59a58d35487a1e68b540e6 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -142,6 +142,16 @@ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, QLIST_INSERT_HEAD(&cpu->pre_el_change_hooks, entry, node); } +void arm_unregister_pre_el_change_hooks(ARMCPU *cpu) +{ + ARMELChangeHook *entry, *next; + + QLIST_FOREACH_SAFE(entry, &cpu->pre_el_change_hooks, node, next) { + QLIST_REMOVE(entry, node); + g_free(entry); + } +} + void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque) { @@ -153,6 +163,16 @@ void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, QLIST_INSERT_HEAD(&cpu->el_change_hooks, entry, node); } +void arm_unregister_el_change_hooks(ARMCPU *cpu) +{ + ARMELChangeHook *entry, *next; + + QLIST_FOREACH_SAFE(entry, &cpu->el_change_hooks, node, next) { + QLIST_REMOVE(entry, node); + g_free(entry); + } +} + static void cp_reg_reset(gpointer key, gpointer value, gpointer opaque) { /* Reset a single ARMCPRegInfo register */ @@ -1615,6 +1635,10 @@ void arm_cpu_post_init(Object *obj) } } else if (cpu_isar_feature(aa32_vfp, cpu)) { cpu->has_vfp = true; + if (tcg_enabled() || qtest_enabled()) { + qdev_property_add_static(DEVICE(obj), + &arm_cpu_has_vfp_property); + } if (cpu_isar_feature(aa32_simd_r32, cpu)) { cpu->has_vfp_d32 = true; /* @@ -2390,6 +2414,94 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) acc->parent_realize(dev, errp); } +static void arm_cpu_unrealizefn(DeviceState *dev) +{ + ARMCPUClass *acc = ARM_CPU_GET_CLASS(dev); + ARMCPU *cpu = ARM_CPU(dev); + CPUARMState *env = &cpu->env; + CPUState *cs = CPU(dev); + bool has_secure; + +#ifndef CONFIG_USER_ONLY + has_secure = cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY); + + /* rock 'n' un-roll, whatever happened in the arm_cpu_realizefn cleanly */ + cpu_address_space_destroy(cs, ARMASIdx_NS); + + if (cpu->tag_memory != NULL) { + cpu_address_space_destroy(cs, ARMASIdx_TagNS); + if (has_secure) { + cpu_address_space_destroy(cs, ARMASIdx_TagS); + } + } + + if (has_secure) { + cpu_address_space_destroy(cs, ARMASIdx_S); + } +#endif + + destroy_cpreg_list(cpu); + arm_cpu_unregister_gdb_regs(cpu); + unregister_cp_regs_for_features(cpu); + +#ifndef CONFIG_USER_ONLY + if (tcg_enabled() && cpu_isar_feature(aa64_rme, cpu)) { + arm_unregister_el_change_hooks(cpu); + } +#endif + + if (cpu->sau_sregion && arm_feature(env, ARM_FEATURE_M_SECURITY)) { + g_free(env->sau.rbar); + 
g_free(env->sau.rlar); + } + + if (arm_feature(env, ARM_FEATURE_PMSA) && + arm_feature(env, ARM_FEATURE_V7)) { + if (cpu->pmsav7_dregion) { + if (arm_feature(env, ARM_FEATURE_V8)) { + g_free(env->pmsav8.rbar[M_REG_NS]); + g_free(env->pmsav8.rlar[M_REG_NS]); + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + g_free(env->pmsav8.rbar[M_REG_S]); + g_free(env->pmsav8.rlar[M_REG_S]); + } + } else { + g_free(env->pmsav7.drbar); + g_free(env->pmsav7.drsr); + g_free(env->pmsav7.dracr); + } + } + if (cpu->pmsav8r_hdregion) { + g_free(env->pmsav8.hprbar); + g_free(env->pmsav8.hprlar); + } + } + + if (arm_feature(env, ARM_FEATURE_PMU)) { + if (!kvm_enabled()) { + arm_unregister_pre_el_change_hooks(cpu); + arm_unregister_el_change_hooks(cpu); + } + +#ifndef CONFIG_USER_ONLY + if (cpu->pmu_timer) { + timer_del(cpu->pmu_timer); + } +#endif + } + + cpu_remove_sync(CPU(dev)); + acc->parent_unrealize(dev); + +#ifndef CONFIG_USER_ONLY + timer_del(cpu->gt_timer[GTIMER_PHYS]); + timer_del(cpu->gt_timer[GTIMER_VIRT]); + timer_del(cpu->gt_timer[GTIMER_HYP]); + timer_del(cpu->gt_timer[GTIMER_SEC]); + timer_del(cpu->gt_timer[GTIMER_HYPVIRT]); +#endif +} + static ObjectClass *arm_cpu_class_by_name(const char *cpu_model) { ObjectClass *oc; @@ -2422,6 +2534,10 @@ static Property arm_cpu_properties[] = { DEFINE_PROP_UINT64("mp-affinity", ARMCPU, mp_affinity, ARM64_AFFINITY_INVALID), DEFINE_PROP_INT32("node-id", ARMCPU, node_id, CPU_UNSET_NUMA_NODE_ID), + DEFINE_PROP_INT32("socket-id", ARMCPU, socket_id, 0), + DEFINE_PROP_INT32("cluster-id", ARMCPU, cluster_id, 0), + DEFINE_PROP_INT32("core-id", ARMCPU, core_id, 0), + DEFINE_PROP_INT32("thread-id", ARMCPU, thread_id, 0), DEFINE_PROP_INT32("core-count", ARMCPU, core_count, -1), DEFINE_PROP_END_OF_LIST() }; @@ -2473,6 +2589,12 @@ static const struct TCGCPUOps arm_tcg_ops = { }; #endif /* CONFIG_TCG */ +static int64_t arm_cpu_get_arch_id(CPUState *cs) +{ + ARMCPU *cpu = ARM_CPU(cs); + return cpu->mp_affinity; +} + static void arm_cpu_class_init(ObjectClass *oc, void *data) { ARMCPUClass *acc = ARM_CPU_CLASS(oc); @@ -2482,6 +2604,8 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data) device_class_set_parent_realize(dc, arm_cpu_realizefn, &acc->parent_realize); + device_class_set_parent_unrealize(dc, arm_cpu_unrealizefn, + &acc->parent_unrealize); device_class_set_props(dc, arm_cpu_properties); @@ -2491,6 +2615,7 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data) cc->class_by_name = arm_cpu_class_by_name; cc->has_work = arm_cpu_has_work; cc->dump_state = arm_cpu_dump_state; + cc->get_arch_id = arm_cpu_get_arch_id; cc->set_pc = arm_cpu_set_pc; cc->get_pc = arm_cpu_get_pc; cc->gdb_read_register = arm_cpu_gdb_read_register; diff --git a/target/arm/cpu.h b/target/arm/cpu.h index a0282e0d2817e513db8eb5a159155f52258e6d52..a5ba7f2a2657dbe871b755f4afc2355eb57f5914 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -971,6 +971,9 @@ struct ArchCPU { /* KVM steal time */ OnOffAuto kvm_steal_time; + + /* KVM SVE has been finalized for this CPU */ + bool kvm_sve_finalized; #endif /* CONFIG_KVM */ /* Uniprocessor system with MP extensions */ @@ -1096,6 +1099,10 @@ struct ArchCPU { QLIST_HEAD(, ARMELChangeHook) el_change_hooks; int32_t node_id; /* NUMA node this CPU belongs to */ + int32_t socket_id; + int32_t cluster_id; + int32_t core_id; + int32_t thread_id; /* Used to synchronize KVM and QEMU in-kernel device levels */ uint8_t device_irq_level; @@ -1134,6 +1141,7 @@ struct ARMCPUClass { const ARMCPUInfo *info; DeviceRealize parent_realize; + DeviceUnrealize 
parent_unrealize; ResettablePhases parent_phases; }; @@ -3355,6 +3363,13 @@ static inline AddressSpace *arm_addressspace(CPUState *cs, MemTxAttrs attrs) */ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque); +/** + * arm_unregister_pre_el_change_hook: + * unregister all pre EL change hook functions. Generally called during + * unrealize'ing leg + */ +void arm_unregister_pre_el_change_hooks(ARMCPU *cpu); + /** * arm_register_el_change_hook: * Register a hook function which will be called immediately after this @@ -3367,6 +3382,12 @@ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, */ void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void *opaque); +/** + * arm_unregister_el_change_hook: + * unregister all EL change hook functions. Generally called during + * unrealize'ing leg + */ +void arm_unregister_el_change_hooks(ARMCPU *cpu); /** * arm_rebuild_hflags: diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 1e9c6c85aece12172665e13d9d0f1122250b1dc6..4a1d2daeb6484e5611f922f2fd7652b4c9550e5d 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -110,6 +110,11 @@ void arm_cpu_sve_finalize(ARMCPU *cpu, Error **errp) */ if (!cpu_isar_feature(aa64_sve, cpu)) { /* SVE is disabled and so are all vector lengths. Good. */ + /* + * SVE is disabled and so are all vector lengths. Good. + * Disable all SVE extensions as well. + */ + cpu->isar.id_aa64zfr0 = 0; return; } @@ -705,6 +710,77 @@ static void aarch64_a53_initfn(Object *obj) define_cortex_a72_a57_a53_cp_reginfo(cpu); } +static void aarch64_a72_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + cpu->dtb_compatible = "arm,cortex-a72"; + cpu->kvm_target = QEMU_KVM_ARM_TARGET_GENERIC_V8; + set_feature(&cpu->env, ARM_FEATURE_V8); + set_feature(&cpu->env, ARM_FEATURE_NEON); + set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER); + set_feature(&cpu->env, ARM_FEATURE_AARCH64); + set_feature(&cpu->env, ARM_FEATURE_CBAR_RO); + set_feature(&cpu->env, ARM_FEATURE_EL2); + set_feature(&cpu->env, ARM_FEATURE_EL3); + set_feature(&cpu->env, ARM_FEATURE_PMU); + cpu->midr = 0x410fd083; + cpu->revidr = 0x00000000; + cpu->reset_fpsid = 0x41034080; + cpu->isar.mvfr0 = 0x10110222; + cpu->isar.mvfr1 = 0x12111111; + cpu->isar.mvfr2 = 0x00000043; + cpu->ctr = 0x8444c004; + cpu->reset_sctlr = 0x00c50838; + cpu->isar.id_pfr0 = 0x00000131; + cpu->isar.id_pfr1 = 0x00011011; + cpu->isar.id_dfr0 = 0x03010066; + cpu->id_afr0 = 0x00000000; + cpu->isar.id_mmfr0 = 0x10201105; + cpu->isar.id_mmfr1 = 0x40000000; + cpu->isar.id_mmfr2 = 0x01260000; + cpu->isar.id_mmfr3 = 0x02102211; + cpu->isar.id_isar0 = 0x02101110; + cpu->isar.id_isar1 = 0x13112111; + cpu->isar.id_isar2 = 0x21232042; + cpu->isar.id_isar3 = 0x01112131; + cpu->isar.id_isar4 = 0x00011142; + cpu->isar.id_isar5 = 0x00011121; + cpu->isar.id_aa64pfr0 = 0x00002222; + cpu->isar.id_aa64dfr0 = 0x10305106; + cpu->isar.id_aa64isar0 = 0x00011120; + cpu->isar.id_aa64mmfr0 = 0x00001124; + cpu->isar.dbgdidr = 0x3516d000; + cpu->clidr = 0x0a200023; + cpu->ccsidr[0] = 0x701fe00a; /* 32KB L1 dcache */ + cpu->ccsidr[1] = 0x201fe012; /* 48KB L1 icache */ + cpu->ccsidr[2] = 0x707fe07a; /* 1MB L2 cache */ + cpu->dcz_blocksize = 4; /* 64 bytes */ + cpu->gic_num_lrs = 4; + cpu->gic_vpribits = 5; + cpu->gic_vprebits = 5; + define_cortex_a72_a57_a53_cp_reginfo(cpu); +} + +static void aarch64_kunpeng_920_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + /* + * Hisilicon Kunpeng-920 CPU is similar to cortex-a72, + * so first initialize cpu 
data as a cortex-a72, + * then update the ID registers that differ. + */ + aarch64_a72_initfn(obj); + + cpu->midr = 0x480fd010; + cpu->ctr = 0x84448004; + cpu->isar.id_aa64pfr0 = 0x11001111; + cpu->isar.id_aa64dfr0 = 0x110305408; + cpu->isar.id_aa64isar0 = 0x10211120; + cpu->isar.id_aa64mmfr0 = 0x101125; +} + static void aarch64_host_initfn(Object *obj) { #if defined(CONFIG_KVM) @@ -723,6 +799,34 @@ static void aarch64_host_initfn(Object *obj) #endif } +static void aarch64_max_ft2000plus_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); + } else { + aarch64_a72_initfn(obj); + cpu->midr = 0x70186622; + } + aarch64_add_sve_properties(obj); + aarch64_add_pauth_properties(obj); +} + +static void aarch64_max_tengyun_s2500_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + if (kvm_enabled()) { + kvm_arm_set_cpu_features_from_host(cpu); + } else { + aarch64_a72_initfn(obj); + cpu->midr = 0x70186632; + } + aarch64_add_sve_properties(obj); + aarch64_add_pauth_properties(obj); +} + static void aarch64_max_initfn(Object *obj) { if (kvm_enabled() || hvf_enabled()) { @@ -744,6 +848,9 @@ static const ARMCPUInfo aarch64_cpus[] = { { .name = "cortex-a57", .initfn = aarch64_a57_initfn }, { .name = "cortex-a53", .initfn = aarch64_a53_initfn }, + { .name = "Kunpeng-920", .initfn = aarch64_kunpeng_920_initfn }, + { .name = "FT-2000+", .initfn = aarch64_max_ft2000plus_initfn }, + { .name = "Tengyun-S2500", .initfn = aarch64_max_tengyun_s2500_initfn }, { .name = "max", .initfn = aarch64_max_initfn }, #if defined(CONFIG_KVM) || defined(CONFIG_HVF) { .name = "host", .initfn = aarch64_host_initfn }, @@ -778,6 +885,18 @@ static void aarch64_cpu_set_aarch64(Object *obj, bool value, Error **errp) } } +static void aarch64_cpu_initfn(Object *obj) +{ + CPUState *cs = CPU(obj); + + /* + * Every ARM64 vCPU starts as a disabled possible vCPU.
It needs to be + * enabled explicitly + */ + cs->disabled = true; + cs->thread_id = 0; +} + static void aarch64_cpu_finalizefn(Object *obj) { } @@ -790,7 +909,9 @@ static const gchar *aarch64_gdb_arch_name(CPUState *cs) static void aarch64_cpu_class_init(ObjectClass *oc, void *data) { CPUClass *cc = CPU_CLASS(oc); + DeviceClass *dc = DEVICE_CLASS(oc); + dc->user_creatable = true; cc->gdb_read_register = aarch64_cpu_gdb_read_register; cc->gdb_write_register = aarch64_cpu_gdb_write_register; cc->gdb_num_core_regs = 34; @@ -836,6 +957,7 @@ void aarch64_cpu_register(const ARMCPUInfo *info) static const TypeInfo aarch64_cpu_type_info = { .name = TYPE_AARCH64_CPU, .parent = TYPE_ARM_CPU, + .instance_init = aarch64_cpu_initfn, .instance_finalize = aarch64_cpu_finalizefn, .abstract = true, .class_init = aarch64_cpu_class_init, diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c index 28f546a5ff9525f9512ac1cd518ebbb2fd06e86f..5ba1e28e346d79e335c6fe22d2581a885b151fef 100644 --- a/target/arm/gdbstub.c +++ b/target/arm/gdbstub.c @@ -553,3 +553,9 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu) } #endif /* CONFIG_TCG */ } + +void arm_cpu_unregister_gdb_regs(ARMCPU *cpu) +{ + CPUState *cs = CPU(cpu); + gdb_unregister_coprocessor_all(cs); +} diff --git a/target/arm/helper.c b/target/arm/helper.c index 2746d3fdac884ab96a7aa07e9ed6d62195cf7630..0370a739e3bbfcf3b04c548190764dd5ac3c4637 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -263,6 +263,19 @@ void init_cpreg_list(ARMCPU *cpu) g_list_free(keys); } +void destroy_cpreg_list(ARMCPU *cpu) +{ + assert(cpu->cpreg_indexes); + assert(cpu->cpreg_values); + assert(cpu->cpreg_vmstate_indexes); + assert(cpu->cpreg_vmstate_values); + + g_free(cpu->cpreg_indexes); + g_free(cpu->cpreg_values); + g_free(cpu->cpreg_vmstate_indexes); + g_free(cpu->cpreg_vmstate_values); +} + /* * Some registers are not accessible from AArch32 EL3 if SCR.NS == 0. */ @@ -1169,13 +1182,21 @@ static bool pmu_counter_enabled(CPUARMState *env, uint8_t counter) bool enabled, prohibited = false, filtered; bool secure = arm_is_secure(env); int el = arm_current_el(env); - uint64_t mdcr_el2 = arm_mdcr_el2_eff(env); - uint8_t hpmn = mdcr_el2 & MDCR_HPMN; + uint64_t mdcr_el2; + uint8_t hpmn; + /* + * We might be called for M-profile cores where MDCR_EL2 doesn't + * exist and arm_mdcr_el2_eff() will assert, so this early-exit check + * must be before we read that value. + */ if (!arm_feature(env, ARM_FEATURE_PMU)) { return false; } + mdcr_el2 = arm_mdcr_el2_eff(env); + hpmn = mdcr_el2 & MDCR_HPMN; + if (!arm_feature(env, ARM_FEATURE_EL2) || (counter < hpmn || counter == 31)) { e = env->cp15.c9_pmcr & PMCRE; @@ -9438,6 +9459,18 @@ void register_cp_regs_for_features(ARMCPU *cpu) #endif } +void unregister_cp_regs_for_features(ARMCPU *cpu) +{ + CPUARMState *env = &cpu->env; + if (arm_feature(env, ARM_FEATURE_M)) { + /* M profile has no coprocessor registers */ + return; + } + + /* empty it all. unregister all the coprocessor registers */ + g_hash_table_remove_all(cpu->cp_regs); +} + /* Sort alphabetically by type name, except for "any". 
*/ static gint arm_cpu_list_compare(gconstpointer a, gconstpointer b) { @@ -10823,6 +10856,24 @@ static void arm_cpu_do_interrupt_aarch32(CPUState *cs) } if (env->exception.target_el == 2) { + /* Debug exceptions are reported differently on AArch32 */ + switch (syn_get_ec(env->exception.syndrome)) { + case EC_BREAKPOINT: + case EC_BREAKPOINT_SAME_EL: + case EC_AA32_BKPT: + case EC_VECTORCATCH: + env->exception.syndrome = syn_insn_abort(arm_current_el(env) == 2, + 0, 0, 0x22); + break; + case EC_WATCHPOINT: + env->exception.syndrome = syn_set_ec(env->exception.syndrome, + EC_DATAABORT); + break; + case EC_WATCHPOINT_SAME_EL: + env->exception.syndrome = syn_set_ec(env->exception.syndrome, + EC_DATAABORT_SAME_EL); + break; + } arm_cpu_do_interrupt_aarch32_hyp(cs); return; } @@ -11321,7 +11372,7 @@ void arm_cpu_do_interrupt(CPUState *cs) env->exception.syndrome); } - if (tcg_enabled() && arm_is_psci_call(cpu, cs->exception_index)) { + if (arm_is_psci_call(cpu, cs->exception_index)) { arm_handle_psci_call(cpu); qemu_log_mask(CPU_LOG_INT, "...handled as PSCI call\n"); return; diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c index 757e13b0f904e7390d431158342d9350457e583e..d7cc00a084b413fc6df6c7322b37a542b3780be1 100644 --- a/target/arm/hvf/hvf.c +++ b/target/arm/hvf/hvf.c @@ -392,85 +392,85 @@ struct hvf_sreg_match { }; static struct hvf_sreg_match hvf_sreg_match[] = { - { HV_SYS_REG_DBGBVR0_EL1, HVF_SYSREG(0, 0, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR0_EL1, HVF_SYSREG(0, 0, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR0_EL1, HVF_SYSREG(0, 0, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR0_EL1, HVF_SYSREG(0, 0, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR1_EL1, HVF_SYSREG(0, 1, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR1_EL1, HVF_SYSREG(0, 1, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR1_EL1, HVF_SYSREG(0, 1, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR1_EL1, HVF_SYSREG(0, 1, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR2_EL1, HVF_SYSREG(0, 2, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR2_EL1, HVF_SYSREG(0, 2, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR2_EL1, HVF_SYSREG(0, 2, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR2_EL1, HVF_SYSREG(0, 2, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR3_EL1, HVF_SYSREG(0, 3, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR3_EL1, HVF_SYSREG(0, 3, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR3_EL1, HVF_SYSREG(0, 3, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR3_EL1, HVF_SYSREG(0, 3, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR4_EL1, HVF_SYSREG(0, 4, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR4_EL1, HVF_SYSREG(0, 4, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR4_EL1, HVF_SYSREG(0, 4, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR4_EL1, HVF_SYSREG(0, 4, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR5_EL1, HVF_SYSREG(0, 5, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR5_EL1, HVF_SYSREG(0, 5, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR5_EL1, HVF_SYSREG(0, 5, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR5_EL1, HVF_SYSREG(0, 5, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR6_EL1, HVF_SYSREG(0, 6, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR6_EL1, HVF_SYSREG(0, 6, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR6_EL1, HVF_SYSREG(0, 6, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR6_EL1, HVF_SYSREG(0, 6, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR7_EL1, HVF_SYSREG(0, 7, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR7_EL1, HVF_SYSREG(0, 7, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR7_EL1, HVF_SYSREG(0, 7, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR7_EL1, HVF_SYSREG(0, 7, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR8_EL1, HVF_SYSREG(0, 8, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR8_EL1, HVF_SYSREG(0, 8, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR8_EL1, HVF_SYSREG(0, 8, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR8_EL1, HVF_SYSREG(0, 8, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR9_EL1, HVF_SYSREG(0, 9, 14, 0, 4) 
}, - { HV_SYS_REG_DBGBCR9_EL1, HVF_SYSREG(0, 9, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR9_EL1, HVF_SYSREG(0, 9, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR9_EL1, HVF_SYSREG(0, 9, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR10_EL1, HVF_SYSREG(0, 10, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR10_EL1, HVF_SYSREG(0, 10, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR10_EL1, HVF_SYSREG(0, 10, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR10_EL1, HVF_SYSREG(0, 10, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR11_EL1, HVF_SYSREG(0, 11, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR11_EL1, HVF_SYSREG(0, 11, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR11_EL1, HVF_SYSREG(0, 11, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR11_EL1, HVF_SYSREG(0, 11, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR12_EL1, HVF_SYSREG(0, 12, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR12_EL1, HVF_SYSREG(0, 12, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR12_EL1, HVF_SYSREG(0, 12, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR12_EL1, HVF_SYSREG(0, 12, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR13_EL1, HVF_SYSREG(0, 13, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR13_EL1, HVF_SYSREG(0, 13, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR13_EL1, HVF_SYSREG(0, 13, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR13_EL1, HVF_SYSREG(0, 13, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR14_EL1, HVF_SYSREG(0, 14, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR14_EL1, HVF_SYSREG(0, 14, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR14_EL1, HVF_SYSREG(0, 14, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR14_EL1, HVF_SYSREG(0, 14, 14, 0, 7) }, - - { HV_SYS_REG_DBGBVR15_EL1, HVF_SYSREG(0, 15, 14, 0, 4) }, - { HV_SYS_REG_DBGBCR15_EL1, HVF_SYSREG(0, 15, 14, 0, 5) }, - { HV_SYS_REG_DBGWVR15_EL1, HVF_SYSREG(0, 15, 14, 0, 6) }, - { HV_SYS_REG_DBGWCR15_EL1, HVF_SYSREG(0, 15, 14, 0, 7) }, + { HV_SYS_REG_DBGBVR0_EL1, HVF_SYSREG(0, 0, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR0_EL1, HVF_SYSREG(0, 0, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR0_EL1, HVF_SYSREG(0, 0, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR0_EL1, HVF_SYSREG(0, 0, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR1_EL1, HVF_SYSREG(0, 1, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR1_EL1, HVF_SYSREG(0, 1, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR1_EL1, HVF_SYSREG(0, 1, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR1_EL1, HVF_SYSREG(0, 1, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR2_EL1, HVF_SYSREG(0, 2, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR2_EL1, HVF_SYSREG(0, 2, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR2_EL1, HVF_SYSREG(0, 2, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR2_EL1, HVF_SYSREG(0, 2, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR3_EL1, HVF_SYSREG(0, 3, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR3_EL1, HVF_SYSREG(0, 3, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR3_EL1, HVF_SYSREG(0, 3, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR3_EL1, HVF_SYSREG(0, 3, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR4_EL1, HVF_SYSREG(0, 4, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR4_EL1, HVF_SYSREG(0, 4, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR4_EL1, HVF_SYSREG(0, 4, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR4_EL1, HVF_SYSREG(0, 4, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR5_EL1, HVF_SYSREG(0, 5, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR5_EL1, HVF_SYSREG(0, 5, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR5_EL1, HVF_SYSREG(0, 5, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR5_EL1, HVF_SYSREG(0, 5, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR6_EL1, HVF_SYSREG(0, 6, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR6_EL1, HVF_SYSREG(0, 6, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR6_EL1, HVF_SYSREG(0, 6, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR6_EL1, HVF_SYSREG(0, 6, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR7_EL1, HVF_SYSREG(0, 7, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR7_EL1, HVF_SYSREG(0, 7, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR7_EL1, HVF_SYSREG(0, 7, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR7_EL1, HVF_SYSREG(0, 7, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR8_EL1, HVF_SYSREG(0, 8, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR8_EL1, 
HVF_SYSREG(0, 8, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR8_EL1, HVF_SYSREG(0, 8, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR8_EL1, HVF_SYSREG(0, 8, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR9_EL1, HVF_SYSREG(0, 9, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR9_EL1, HVF_SYSREG(0, 9, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR9_EL1, HVF_SYSREG(0, 9, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR9_EL1, HVF_SYSREG(0, 9, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR10_EL1, HVF_SYSREG(0, 10, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR10_EL1, HVF_SYSREG(0, 10, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR10_EL1, HVF_SYSREG(0, 10, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR10_EL1, HVF_SYSREG(0, 10, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR11_EL1, HVF_SYSREG(0, 11, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR11_EL1, HVF_SYSREG(0, 11, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR11_EL1, HVF_SYSREG(0, 11, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR11_EL1, HVF_SYSREG(0, 11, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR12_EL1, HVF_SYSREG(0, 12, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR12_EL1, HVF_SYSREG(0, 12, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR12_EL1, HVF_SYSREG(0, 12, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR12_EL1, HVF_SYSREG(0, 12, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR13_EL1, HVF_SYSREG(0, 13, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR13_EL1, HVF_SYSREG(0, 13, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR13_EL1, HVF_SYSREG(0, 13, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR13_EL1, HVF_SYSREG(0, 13, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR14_EL1, HVF_SYSREG(0, 14, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR14_EL1, HVF_SYSREG(0, 14, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR14_EL1, HVF_SYSREG(0, 14, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR14_EL1, HVF_SYSREG(0, 14, 2, 0, 7) }, + + { HV_SYS_REG_DBGBVR15_EL1, HVF_SYSREG(0, 15, 2, 0, 4) }, + { HV_SYS_REG_DBGBCR15_EL1, HVF_SYSREG(0, 15, 2, 0, 5) }, + { HV_SYS_REG_DBGWVR15_EL1, HVF_SYSREG(0, 15, 2, 0, 6) }, + { HV_SYS_REG_DBGWCR15_EL1, HVF_SYSREG(0, 15, 2, 0, 7) }, #ifdef SYNC_NO_RAW_REGS /* @@ -482,7 +482,7 @@ static struct hvf_sreg_match hvf_sreg_match[] = { { HV_SYS_REG_MPIDR_EL1, HVF_SYSREG(0, 0, 3, 0, 5) }, { HV_SYS_REG_ID_AA64PFR0_EL1, HVF_SYSREG(0, 4, 3, 0, 0) }, #endif - { HV_SYS_REG_ID_AA64PFR1_EL1, HVF_SYSREG(0, 4, 3, 0, 2) }, + { HV_SYS_REG_ID_AA64PFR1_EL1, HVF_SYSREG(0, 4, 3, 0, 1) }, { HV_SYS_REG_ID_AA64DFR0_EL1, HVF_SYSREG(0, 5, 3, 0, 0) }, { HV_SYS_REG_ID_AA64DFR1_EL1, HVF_SYSREG(0, 5, 3, 0, 1) }, { HV_SYS_REG_ID_AA64ISAR0_EL1, HVF_SYSREG(0, 6, 3, 0, 0) }, @@ -1272,6 +1272,7 @@ static int hvf_sysreg_read(CPUState *cpu, uint32_t reg, uint32_t rt) /* Call the TCG sysreg handler. This is only safe for GICv3 regs. */ if (!hvf_sysreg_read_cp(cpu, reg, &val)) { hvf_raise_exception(cpu, EXCP_UDEF, syn_uncategorized()); + return 1; } break; case SYSREG_DBGBVR0_EL1: diff --git a/target/arm/internals.h b/target/arm/internals.h index 143d57c0fe46a04d22c8b1fcfa1629b5f7322855..ed9bfb29c8731b028eb77df3860989e8b08d0655 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -187,9 +187,12 @@ void arm_cpu_register(const ARMCPUInfo *info); void aarch64_cpu_register(const ARMCPUInfo *info); void register_cp_regs_for_features(ARMCPU *cpu); +void unregister_cp_regs_for_features(ARMCPU *cpu); void init_cpreg_list(ARMCPU *cpu); +void destroy_cpreg_list(ARMCPU *cpu); void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu); +void arm_cpu_unregister_gdb_regs(ARMCPU *cpu); void arm_translate_init(void); void arm_restore_state_to_opc(CPUState *cs, @@ -311,21 +314,10 @@ vaddr arm_adjust_watchpoint_address(CPUState *cs, vaddr addr, int len); /* Callback function for when a watchpoint or breakpoint triggers. 
*/ void arm_debug_excp_handler(CPUState *cs); -#if defined(CONFIG_USER_ONLY) || !defined(CONFIG_TCG) -static inline bool arm_is_psci_call(ARMCPU *cpu, int excp_type) -{ - return false; -} -static inline void arm_handle_psci_call(ARMCPU *cpu) -{ - g_assert_not_reached(); -} -#else /* Return true if the r0/x0 value indicates that this SMC/HVC is a PSCI call. */ bool arm_is_psci_call(ARMCPU *cpu, int excp_type); /* Actually handle a PSCI call */ void arm_handle_psci_call(ARMCPU *cpu); -#endif /** * arm_clear_exclusive: clear the exclusive monitor @@ -1273,7 +1265,7 @@ FIELD(MTEDESC, TBI, 4, 2) FIELD(MTEDESC, TCMA, 6, 2) FIELD(MTEDESC, WRITE, 8, 1) FIELD(MTEDESC, ALIGN, 9, 3) -FIELD(MTEDESC, SIZEM1, 12, SIMD_DATA_BITS - 12) /* size - 1 */ +FIELD(MTEDESC, SIZEM1, 12, SIMD_DATA_BITS - SVE_MTEDESC_SHIFT - 12) /* size - 1 */ bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr); uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra); diff --git a/target/arm/kvm-consts.h b/target/arm/kvm-consts.h index 7c6adc14f6e8dc8870316f64a158f1b95d654678..c034823170f46c20465c81ebde89b87a14577347 100644 --- a/target/arm/kvm-consts.h +++ b/target/arm/kvm-consts.h @@ -133,6 +133,8 @@ MISMATCH_CHECK(QEMU_PSCI_RET_DISABLED, PSCI_RET_DISABLED); #define QEMU_KVM_ARM_TARGET_CORTEX_A57 2 #define QEMU_KVM_ARM_TARGET_XGENE_POTENZA 3 #define QEMU_KVM_ARM_TARGET_CORTEX_A53 4 +/* Generic ARM v8 target */ +#define QEMU_KVM_ARM_TARGET_GENERIC_V8 5 /* There's no kernel define for this: sentinel value which * matches no KVM target value for either 64 or 32 bit @@ -144,6 +146,7 @@ MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_FOUNDATION_V8, KVM_ARM_TARGET_FOUNDATION_V8); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A57, KVM_ARM_TARGET_CORTEX_A57); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_XGENE_POTENZA, KVM_ARM_TARGET_XGENE_POTENZA); MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_CORTEX_A53, KVM_ARM_TARGET_CORTEX_A53); +MISMATCH_CHECK(QEMU_KVM_ARM_TARGET_GENERIC_V8, KVM_ARM_TARGET_GENERIC_V8); #define CP_REG_ARM64 0x6000000000000000ULL #define CP_REG_ARM_COPROC_MASK 0x000000000FFF0000 diff --git a/target/arm/kvm-tmm.c b/target/arm/kvm-tmm.c new file mode 100644 index 0000000000000000000000000000000000000000..d18ac1089647fbf035a896c469e6d2901dda3ce7 --- /dev/null +++ b/target/arm/kvm-tmm.c @@ -0,0 +1,441 @@ +/* + * QEMU add virtcca cvm feature. + * + * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "exec/confidential-guest-support.h" +#include "hw/boards.h" +#include "hw/core/cpu.h" +#include "kvm_arm.h" +#include "migration/blocker.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-misc-target.h" +#include "qom/object_interfaces.h" +#include "sysemu/kvm.h" +#include "sysemu/runstate.h" +#include "hw/loader.h" +#include "linux-headers/asm-arm64/kvm.h" +#include + +#define TYPE_TMM_GUEST "tmm-guest" +OBJECT_DECLARE_SIMPLE_TYPE(TmmGuest, TMM_GUEST) + +#define TMM_PAGE_SIZE qemu_real_host_page_size() +#define TMM_MAX_PMU_CTRS 0x20 +#define TMM_MAX_CFG 6 +#define TMM_MEMORY_INFO_SYSFS "/sys/kernel/tmm/memory_info" + +typedef struct { + uint32_t kae_vf_num; + hwaddr sec_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; + hwaddr hpre_addr[KVM_ARM_TMM_MAX_KAE_VF_NUM]; +} KaeDeviceInfo; + +struct TmmGuest { + ConfidentialGuestSupport parent_obj; + GSList *ram_regions; + TmmGuestMeasurementAlgo measurement_algo; + uint32_t sve_vl; + uint32_t num_pmu_cntrs; + KaeDeviceInfo kae_device_info; +}; + +typedef struct { + hwaddr base1; + hwaddr len1; + hwaddr base2; + hwaddr len2; + bool populate; +} TmmRamRegion; + +static TmmGuest *tmm_guest; + +bool kvm_arm_tmm_enabled(void) +{ + return !!tmm_guest; +} + +static int tmm_configure_one(TmmGuest *guest, uint32_t cfg, Error **errp) +{ + int ret = 1; + const char *cfg_str; + struct kvm_cap_arm_tmm_config_item args = { + .cfg = cfg, + }; + + switch (cfg) { + case KVM_CAP_ARM_TMM_CFG_RPV: + return 0; + case KVM_CAP_ARM_TMM_CFG_HASH_ALGO: + switch (guest->measurement_algo) { + case TMM_GUEST_MEASUREMENT_ALGO_DEFAULT: + return 0; + case TMM_GUEST_MEASUREMENT_ALGO_SHA256: + args.hash_algo = KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA256; + break; + case TMM_GUEST_MEASUREMENT_ALGO_SHA512: + args.hash_algo = KVM_CAP_ARM_TMM_MEASUREMENT_ALGO_SHA512; + break; + default: + g_assert_not_reached(); + } + cfg_str = "hash algorithm"; + break; + case KVM_CAP_ARM_TMM_CFG_SVE: + if (!guest->sve_vl) { + return 0; + } + args.sve_vq = guest->sve_vl / 128; + cfg_str = "SVE"; + break; + case KVM_CAP_ARM_TMM_CFG_DBG: + return 0; + case KVM_CAP_ARM_TMM_CFG_PMU: + if (!guest->num_pmu_cntrs) { + return 0; + } + args.num_pmu_cntrs = guest->num_pmu_cntrs; + cfg_str = "PMU"; + break; + case KVM_CAP_ARM_TMM_CFG_KAE: + if (!guest->kae_device_info.kae_vf_num) { + return 0; + } + args.kae_vf_num= guest->kae_device_info.kae_vf_num; + for (int i = 0; i < guest->kae_device_info.kae_vf_num; i++) { + args.sec_addr[i] = guest->kae_device_info.sec_addr[i]; + args.hpre_addr[i] = guest->kae_device_info.hpre_addr[i]; + } + cfg_str = "KAE"; + break; + default: + g_assert_not_reached(); + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_CONFIG_CVM, (intptr_t)&args); + if (ret) { + error_setg_errno(errp, -ret, "TMM: failed to configure %s", cfg_str); + } + + return ret; +} + +static gint tmm_compare_ram_regions(gconstpointer a, gconstpointer b) +{ + const TmmRamRegion *ra = a; + const TmmRamRegion *rb = b; + + g_assert(ra->base1 != rb->base1); + return ra->base1 < rb->base1 ? 
-1 : 1; +} + +void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, hwaddr len2, bool populate) +{ + TmmRamRegion *region; + + region = g_new0(TmmRamRegion, 1); + region->base1 = QEMU_ALIGN_DOWN(base1, TMM_PAGE_SIZE); + region->len1 = QEMU_ALIGN_UP(len1, TMM_PAGE_SIZE); + region->base2 = QEMU_ALIGN_DOWN(base2, TMM_PAGE_SIZE); + region->len2 = QEMU_ALIGN_UP(len2, TMM_PAGE_SIZE); + region->populate = populate; + + tmm_guest->ram_regions = g_slist_insert_sorted(tmm_guest->ram_regions, + region, tmm_compare_ram_regions); +} + +static void tmm_populate_region(gpointer data, gpointer unused) +{ + int ret; + const TmmRamRegion *region = data; + struct kvm_cap_arm_tmm_populate_region_args populate_args = { + .populate_ipa_base1 = region->base1, + .populate_ipa_size1 = region->len1, + .populate_ipa_base2 = region->base2, + .populate_ipa_size2 = region->len2, + .flags = KVM_ARM_TMM_POPULATE_FLAGS_MEASURE, + }; + + if (!region->populate) { + return; + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_POPULATE_CVM, + (intptr_t)&populate_args); + if (ret) { + error_report("TMM: failed to populate cvm region (0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx", 0x%"HWADDR_PRIx"): %s", + region->base1, region->len1, region->base2, region->len2, strerror(-ret)); + exit(1); + } +} + +static int tmm_create_rd(Error **errp) +{ + int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_CREATE_RD); + if (ret) { + error_setg_errno(errp, -ret, "TMM: failed to create tmm Descriptor"); + } + return ret; +} + +static void tmm_vm_state_change(void *opaque, bool running, RunState state) +{ + int ret; + CPUState *cs; + + if (!running) { + return; + } + + g_slist_foreach(tmm_guest->ram_regions, tmm_populate_region, NULL); + g_slist_free_full(g_steal_pointer(&tmm_guest->ram_regions), g_free); + + CPU_FOREACH(cs) { + ret = kvm_arm_vcpu_finalize(cs, KVM_ARM_VCPU_TEC); + if (ret) { + error_report("TMM: failed to finalize vCPU: %s", strerror(-ret)); + exit(1); + } + } + + ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_ARM_TMM, 0, + KVM_CAP_ARM_TMM_ACTIVATE_CVM); + if (ret) { + error_report("TMM: failed to activate cvm: %s", strerror(-ret)); + exit(1); + } +} + +int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + int ret; + int cfg; + + if (!tmm_guest) { + return -ENODEV; + } + + if (!kvm_check_extension(kvm_state, KVM_CAP_ARM_TMM)) { + error_setg(errp, "KVM does not support TMM"); + return -ENODEV; + } + + for (cfg = 0; cfg < TMM_MAX_CFG; cfg++) { + ret = tmm_configure_one(tmm_guest, cfg, &error_abort); + if (ret) { + return ret; + } + } + + ret = tmm_create_rd(&error_abort); + if (ret) { + return ret; + } + + qemu_add_vm_change_state_handler(tmm_vm_state_change, NULL); + return 0; +} + +static void tmm_get_sve_vl(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->sve_vl, errp); +} + +static void tmm_set_sve_vl(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value & 0x7f || value >= ARM_MAX_VQ * 128) { + error_setg(errp, "invalid SVE vector length"); + return; + } + + guest->sve_vl = value; +} + +static void tmm_get_num_pmu_cntrs(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->num_pmu_cntrs, 
errp); +} + +static void tmm_set_num_pmu_cntrs(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value >= TMM_MAX_PMU_CTRS) { + error_setg(errp, "invalid number of PMU counters"); + return; + } + + guest->num_pmu_cntrs = value; +} + +static int tmm_get_measurement_algo(Object *obj, Error **errp G_GNUC_UNUSED) +{ + TmmGuest *guest = TMM_GUEST(obj); + + return guest->measurement_algo; +} + +static void tmm_set_measurement_algo(Object *obj, int algo, Error **errp G_GNUC_UNUSED) +{ + TmmGuest *guest = TMM_GUEST(obj); + + guest->measurement_algo = algo; +} + +static void tmm_get_kae_vf_num(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + + visit_type_uint32(v, name, &guest->kae_device_info.kae_vf_num, errp); +} + +static void tmm_set_kae_vf_num(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + TmmGuest *guest = TMM_GUEST(obj); + uint32_t value; + + if (!visit_type_uint32(v, name, &value, errp)) { + return; + } + + if (value > KVM_ARM_TMM_MAX_KAE_VF_NUM) { + error_setg(errp, "invalid number of KAE VFs"); + return; + } + + guest->kae_device_info.kae_vf_num = value; +} + +int tmm_get_kae_num(void) +{ + return tmm_guest->kae_device_info.kae_vf_num; +} + +void tmm_set_sec_addr(hwaddr base, int num) +{ + tmm_guest->kae_device_info.sec_addr[num] = base; +} + +void tmm_set_hpre_addr(hwaddr base, int num) +{ + tmm_guest->kae_device_info.hpre_addr[num] = base; +} + +static void tmm_guest_class_init(ObjectClass *oc, void *data) +{ + object_class_property_add_enum(oc, "measurement-algo", + "TmmGuestMeasurementAlgo", + &TmmGuestMeasurementAlgo_lookup, + tmm_get_measurement_algo, + tmm_set_measurement_algo); + object_class_property_set_description(oc, "measurement-algo", + "CVM measurement algorithm ('sha256', 'sha512')"); + /* + * This is not ideal. Normally SVE parameters are given to -cpu, but the + * CVM parameters are needed much earlier than CPU initialization. We also + * don't have a way to discover what is supported at the moment; the idea + * is that the user knows exactly what hardware they are running on, + * because these parameters are part of the measurement and play a part + * in the attestation. + */ + object_class_property_add(oc, "sve-vector-length", "uint32", tmm_get_sve_vl, + tmm_set_sve_vl, NULL, NULL); + object_class_property_set_description(oc, "sve-vector-length", + "SVE vector length. 0 disables SVE (the default)"); + object_class_property_add(oc, "num-pmu-counters", "uint32", + tmm_get_num_pmu_cntrs, tmm_set_num_pmu_cntrs, + NULL, NULL); + object_class_property_set_description(oc, "num-pmu-counters", + "Number of PMU counters"); + object_class_property_add(oc, "kae", "uint32", tmm_get_kae_vf_num, + tmm_set_kae_vf_num, NULL, NULL); + object_class_property_set_description(oc, "kae", + "Number of KAE virtual functions.
0 disables KAE (the default)"); +} + +static void tmm_guest_instance_init(Object *obj) +{ + if (tmm_guest) { + error_report("only a single instance of TmmGuest is supported"); + exit(1); + } + tmm_guest = TMM_GUEST(obj); +} + +static const TypeInfo tmm_guest_info = { + .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, + .name = TYPE_TMM_GUEST, + .instance_size = sizeof(struct TmmGuest), + .instance_init = tmm_guest_instance_init, + .class_init = tmm_guest_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void tmm_register_types(void) +{ + type_register_static(&tmm_guest_info); +} +type_init(tmm_register_types); + +static VirtccaCapability *virtcca_get_capabilities(Error **errp) +{ + VirtccaCapability *cap = NULL; + uint64_t tmi_version = 0; + int rc = 0; + + if (kvm_ioctl(kvm_state, KVM_GET_TMI_VERSION, &tmi_version) < 0) { + error_setg(errp, "VIRTCCA is not enabled in KVM"); + return NULL; + } + + rc = access(TMM_MEMORY_INFO_SYSFS, R_OK); + if (rc < 0) { + error_setg_errno(errp, errno, "VIRTCCA: Failed to read %s", + TMM_MEMORY_INFO_SYSFS); + return NULL; + } + + cap = g_new0(VirtccaCapability, 1); + + cap->enabled = true; + + return cap; +} + +VirtccaCapability *qmp_query_virtcca_capabilities(Error **errp) +{ + return virtcca_get_capabilities(errp); +} \ No newline at end of file diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 7903e2ddde1b70bbd0db7f27992afe86763899d5..ab31515a2af6500374ec9676117c162e13692365 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -26,6 +26,8 @@ #include "trace.h" #include "internals.h" #include "hw/pci/pci.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" #include "exec/memattrs.h" #include "exec/address-spaces.h" #include "hw/boards.h" @@ -255,9 +257,27 @@ int kvm_arch_get_default_type(MachineState *ms) return fixed_ipa ? 0 : size; } +static void kvm_update_ipiv_cap(KVMState *s) +{ + int ret; + + if (!kvm_check_extension(s, KVM_CAP_ARM_HISI_IPIV)) { + return; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_ARM_HISI_IPIV, 0); + if (ret) { + fprintf(stderr, "Could not enable KVM_CAP_ARM_HISI_IPIV: %d\n", ret); + } + + return; +} + int kvm_arch_init(MachineState *ms, KVMState *s) { + MachineClass *mc = MACHINE_GET_CLASS(ms); int ret = 0; + /* For ARM interrupt delivery is always asynchronous, * whether we are using an in-kernel VGIC or not. */ @@ -308,7 +328,25 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } + /* + * To be able to handle PSCI CPU ON calls in QEMU, we need to install an + * SMCCC filter in the host KVM. This is required to support features + * such as virtual CPU hotplug on ARM platforms.
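+ * For example, a guest started with "-smp cpus=2,maxcpus=4" leaves two
+ * vCPUs unplugged; a later PSCI CPU_ON call for one of them is then
+ * forwarded to QEMU instead of being completed entirely in-kernel.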
+ */ + if (mc->has_hotpluggable_cpus && ms->smp.max_cpus > ms->smp.cpus) { + if (kvm_arm_set_smccc_filter(PSCI_0_2_FN64_CPU_ON, + KVM_SMCCC_FILTER_FWD_TO_USER)) { + error_report("CPU On PSCI-to-user-space fwd filter install failed"); + } else if (kvm_arm_set_smccc_filter(PSCI_0_2_FN_CPU_OFF, + KVM_SMCCC_FILTER_FWD_TO_USER)) { + error_report("CPU Off PSCI-to-user-space fwd filter install failed"); + } else { + s->kvm_smccc_filter_enabled = true; + } + } + kvm_arm_init_debug(s); + kvm_update_ipiv_cap(s); return ret; } @@ -592,6 +630,10 @@ bool write_list_to_kvmstate(ARMCPU *cpu, int level) continue; } + if (virtcca_cvm_enabled() && regidx == KVM_REG_ARM_TIMER_CNT) { + continue; + } + switch (regidx & KVM_REG_SIZE_MASK) { case KVM_REG_SIZE_U32: v32 = cpu->cpreg_values[i]; @@ -634,11 +676,12 @@ void kvm_arm_cpu_post_load(ARMCPU *cpu) void kvm_arm_reset_vcpu(ARMCPU *cpu) { int ret; + CPUState *cs = CPU(cpu); /* Re-init VCPU so that all registers are set to * their respective reset values. */ - ret = kvm_arm_vcpu_init(CPU(cpu)); + ret = kvm_arm_vcpu_init(cs); if (ret < 0) { fprintf(stderr, "kvm_arm_vcpu_init failed: %s\n", strerror(-ret)); abort(); @@ -655,6 +698,45 @@ void kvm_arm_reset_vcpu(ARMCPU *cpu) * for the same reason we do so in kvm_arch_get_registers(). */ write_list_to_cpustate(cpu); + + /* + * Ensure we call kvm_arch_put_registers(). The vCPU isn't marked dirty if + * it was parked in KVM and is now booting from a PSCI CPU_ON call. + */ + cs->vcpu_dirty = true; +} + +void kvm_arm_create_host_vcpu(ARMCPU *cpu) +{ + CPUState *cs = CPU(cpu); + unsigned long vcpu_id = cs->cpu_index; + int ret; + + ret = kvm_create_vcpu(cs); + if (ret < 0) { + error_report("Failed to create host vcpu %ld", vcpu_id); + abort(); + } + + /* + * Initialize the vCPU in the host. This resets the sys regs for this + * vCPU; related registers like MPIDR_EL1 also get programmed during + * this call into the host. These are referred to later while setting + * device attributes of the GICR during GICv3 reset. + */ + arm_cpu_finalize_features(cpu, &error_abort); + ret = kvm_arch_init_vcpu(cs); + if (ret < 0) { + error_report("Failed to initialize host vcpu %ld", vcpu_id); + abort(); + } + + /* + * Park the created vCPU; it shall be picked up again via kvm_get_vcpu() + * when threads are created during realization of the ARM vCPUs. + */ + kvm_park_vcpu(cs); } /* @@ -925,6 +1007,38 @@ static int kvm_arm_handle_dabt_nisv(CPUState *cs, uint64_t esr_iss, return -1; } +static int kvm_arm_handle_hypercall(CPUState *cs, struct kvm_run *run) +{ + ARMCPU *cpu = ARM_CPU(cs); + CPUARMState *env = &cpu->env; + + kvm_cpu_synchronize_state(cs); + + /* + * Hard-code the immediate to 0, as we don't expect a non-zero value for + * now. This might change in future versions; KVM_GET_ONE_REG could then + * be used, but only once synchronization is enhanced to also fetch the + * ESR_EL2 value. + */ + if (run->hypercall.flags == KVM_HYPERCALL_EXIT_SMC) { + cs->exception_index = EXCP_SMC; + env->exception.syndrome = syn_aa64_smc(0); + } else { + cs->exception_index = EXCP_HVC; + env->exception.syndrome = syn_aa64_hvc(0); + } + env->exception.target_el = 1; + qemu_mutex_lock_iothread(); + arm_cpu_do_interrupt(cs); + qemu_mutex_unlock_iothread(); + + /* + * For PSCI, exit the kvm_run loop and process the work. Especially + * important if this was a CPU_OFF command and we can't return to the guest.
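+ * Returning EXCP_INTERRUPT (a positive value) causes kvm_cpu_exec() to
+ * leave its run loop rather than immediately re-entering the guest.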
+ */ + return EXCP_INTERRUPT; +} + int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) { int ret = 0; @@ -940,6 +1054,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) ret = kvm_arm_handle_dabt_nisv(cs, run->arm_nisv.esr_iss, run->arm_nisv.fault_ipa); break; + case KVM_EXIT_HYPERCALL: + ret = kvm_arm_handle_hypercall(cs, run); + break; default: qemu_log_mask(LOG_UNIMP, "%s: un-handled exit reason %d\n", __func__, run->exit_reason); @@ -1053,6 +1170,51 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, return 0; } +int kvm_create_shadow_device(PCIDevice *dev) +{ + KVMState *s = kvm_state; + struct kvm_master_dev_info *mdi; + MSIMessage msg; + uint32_t vector, nvectors = msix_nr_vectors_allocated(dev); + uint32_t request_id; + int ret; + + if (!kvm_vm_check_extension(s, KVM_CAP_ARM_VIRT_MSI_BYPASS) || !nvectors) { + return 0; + } + + mdi = g_malloc0(sizeof(uint32_t) + sizeof(struct kvm_msi) * nvectors); + mdi->nvectors = nvectors; + request_id = pci_requester_id(dev); + + for (vector = 0; vector < nvectors; vector++) { + msg = msix_get_message(dev, vector); + mdi->msi[vector].address_lo = extract64(msg.address, 0, 32); + mdi->msi[vector].address_hi = extract64(msg.address, 32, 32); + mdi->msi[vector].data = le32_to_cpu(msg.data); + mdi->msi[vector].flags = KVM_MSI_VALID_DEVID; + mdi->msi[vector].devid = request_id; + memset(mdi->msi[vector].pad, 0, sizeof(mdi->msi[vector].pad)); + } + + ret = kvm_vm_ioctl(s, KVM_CREATE_SHADOW_DEV, mdi); + g_free(mdi); + return ret; +} + +int kvm_delete_shadow_device(PCIDevice *dev) +{ + KVMState *s = kvm_state; + uint32_t request_id, nvectors = msix_nr_vectors_allocated(dev); + + if (!kvm_vm_check_extension(s, KVM_CAP_ARM_VIRT_MSI_BYPASS) || !nvectors) { + return 0; + } + + request_id = pci_requester_id(dev); + return kvm_vm_ioctl(s, KVM_DEL_SHADOW_DEV, &request_id); +} + int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, int vector, PCIDevice *dev) { @@ -1071,7 +1233,7 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) bool kvm_arch_cpu_check_are_resettable(void) { - return true; + return !virtcca_cvm_enabled(); } static void kvm_arch_get_eager_split_size(Object *obj, Visitor *v, diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 3c175c93a7a8f29a7a157c9bf059db601f546b7f..b099287ed0645d466604456236171c6b1dc3dd52 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -111,6 +111,25 @@ bool kvm_arm_hw_debug_active(CPUState *cs) return ((cur_hw_wps > 0) || (cur_hw_bps > 0)); } +static bool kvm_arm_set_vm_attr(struct kvm_device_attr *attr, const char *name) +{ + int err; + + err = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, attr); + if (err != 0) { + error_report("%s: KVM_HAS_DEVICE_ATTR: %s", name, strerror(-err)); + return false; + } + + err = kvm_vm_ioctl(kvm_state, KVM_SET_DEVICE_ATTR, attr); + if (err != 0) { + error_report("%s: KVM_SET_DEVICE_ATTR: %s", name, strerror(-err)); + return false; + } + + return true; +} + static bool kvm_arm_set_device_attr(CPUState *cs, struct kvm_device_attr *attr, const char *name) { @@ -181,6 +200,28 @@ void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa) } } +int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction) +{ + struct kvm_smccc_filter filter = { + .base = func, + .nr_functions = 1, + .action = faction, + }; + struct kvm_device_attr attr = { + .group = KVM_ARM_VM_SMCCC_CTRL, + .attr = KVM_ARM_VM_SMCCC_FILTER, + .flags = 0, + .addr = (uintptr_t)&filter, + }; + + if (!kvm_arm_set_vm_attr(&attr, "SMCCC Filter")) { + error_report("failed to set 
SMCCC filter in KVM Host"); + return -1; + } + + return 0; +} + static int read_sys_reg32(int fd, uint32_t *pret, uint64_t id) { uint64_t ret; @@ -543,6 +584,11 @@ static int kvm_arm_sve_set_vls(CPUState *cs) assert(cpu->sve_max_vq <= KVM_ARM64_SVE_VQ_MAX); + if (virtcca_cvm_enabled()) { + /* Already set through tmm config */ + return 0; + } + return kvm_set_one_reg(cs, KVM_REG_ARM64_SVE_VLS, &vls[0]); } @@ -562,7 +608,14 @@ int kvm_arch_init_vcpu(CPUState *cs) return -EINVAL; } - qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs); + /* + * Install the VM change state handler only when the vCPU thread has + * been spawned, i.e. the vCPU is being realized. + */ + if (cs->thread_id) { + cs->vmcse = qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, + cs); + } /* Determine init features for this CPU */ memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); @@ -599,7 +652,7 @@ int kvm_arch_init_vcpu(CPUState *cs) return ret; } - if (cpu_isar_feature(aa64_sve, cpu)) { + if (cpu_isar_feature(aa64_sve, cpu) && !cpu->kvm_sve_finalized) { ret = kvm_arm_sve_set_vls(cs); if (ret) { return ret; @@ -622,9 +675,8 @@ } /* - * When KVM is in use, PSCI is emulated in-kernel and not by qemu. - * Currently KVM has its own idea about MPIDR assignment, so we - * override our defaults with what we get from KVM. + * KVM may emulate PSCI in-kernel. Currently KVM has its own idea about + * MPIDR assignment, so we override our defaults with what we get from KVM. */ ret = kvm_get_one_reg(cs, ARM64_SYS_REG(ARM_CPU_ID_MPIDR), &mpidr); if (ret) { @@ -640,6 +692,10 @@ int kvm_arch_destroy_vcpu(CPUState *cs) { + if (cs->thread_id) { + qemu_del_vm_change_state_handler(cs->vmcse); + } + return 0; } diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 051a0da41c4929c178134f2b2a67622e84ef42b5..a29d4548f4ce39a767ac59c694beda3ed7b61506 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -73,6 +73,8 @@ int kvm_arm_vcpu_finalize(CPUState *cs, int feature); void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group, uint64_t attr, int dev_fd, uint64_t addr_ormask); +void virtcca_kvm_get_mmio_addr(hwaddr *mmio_start, hwaddr *mmio_size); + /** * kvm_arm_init_cpreg_list: * @cpu: ARMCPU @@ -163,6 +165,17 @@ void kvm_arm_cpu_post_load(ARMCPU *cpu); */ void kvm_arm_reset_vcpu(ARMCPU *cpu); +/** + * kvm_arm_create_host_vcpu: + * @cpu: ARMCPU + * + * Called to pre-create all possible KVM vCPUs within the host at virt + * machine init time. This also inits the pre-created vCPU and hence + * results in a vCPU reset in the host. These pre-created and initialized + * vCPUs shall be parked for use when the ARM vCPUs are actually realized.
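+ * (The parked vCPU fds are managed via kvm_park_vcpu() and looked up
+ * again through kvm_get_vcpu() at realize time.)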
+ */ +void kvm_arm_create_host_vcpu(ARMCPU *cpu); + /** * kvm_arm_init_serror_injection: * @cs: CPUState @@ -377,6 +390,24 @@ void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa); int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level); +void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, hwaddr len2, bool populate); + +int tmm_get_kae_num(void); +void tmm_set_sec_addr(hwaddr base, int num); +void tmm_set_hpre_addr(hwaddr base, int num); + +int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp); +bool kvm_arm_tmm_enabled(void); + +/** + * kvm_arm_set_smccc_filter: + * @func: function + * @faction: SMCCC filter action (handle, deny, fwd-to-user) to be deployed + * + * Sets the ARM SMCCC filter in the KVM host for selective hypercall exits + */ +int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction); + #else /* @@ -451,6 +482,36 @@ static inline uint32_t kvm_arm_sve_get_vls(CPUState *cs) g_assert_not_reached(); } +static inline int kvm_arm_set_smccc_filter(uint64_t func, uint8_t faction) +{ + g_assert_not_reached(); +} + +static inline int kvm_arm_tmm_init(ConfidentialGuestSupport *cgs, Error **errp G_GNUC_UNUSED) +{ + g_assert_not_reached(); +} + +static inline void tmm_add_ram_region(hwaddr base1, hwaddr len1, hwaddr base2, + hwaddr len2, bool populate) +{ + g_assert_not_reached(); +} + +static inline void tmm_set_sec_addr(hwaddr base, int num) +{ + g_assert_not_reached(); +} + +static inline void tmm_set_hpre_addr(hwaddr base, int num) +{ + g_assert_not_reached(); +} + +static inline int tmm_get_kae_num(void) +{ + g_assert_not_reached(); +} #endif /** diff --git a/target/arm/meson.build b/target/arm/meson.build index 5d04a8e94f2ebace7dd56ee3c92923b4e1b8d55d..389ee5465882e31718e27dedd878036eeae2ea8a 100644 --- a/target/arm/meson.build +++ b/target/arm/meson.build @@ -10,6 +10,7 @@ arm_ss.add(zlib) arm_ss.add(when: 'CONFIG_KVM', if_true: files('hyp_gdbstub.c', 'kvm.c', 'kvm64.c'), if_false: files('kvm-stub.c')) arm_ss.add(when: 'CONFIG_HVF', if_true: files('hyp_gdbstub.c')) +arm_ss.add(when: 'CONFIG_KVM', if_true: files('kvm-tmm.c')) arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'cpu64.c', @@ -23,6 +24,7 @@ arm_system_ss.add(files( 'arm-qmp-cmds.c', 'cortex-regs.c', 'machine.c', + 'psci.c', 'ptw.c', )) diff --git a/target/arm/tcg/psci.c b/target/arm/psci.c similarity index 97% rename from target/arm/tcg/psci.c rename to target/arm/psci.c index 6c1239bb9685a00ef7d9ad883ba4c1a0e87ada8b..a8690a16afc53c314411959fe9ebd81bd4ba4652 100644 --- a/target/arm/tcg/psci.c +++ b/target/arm/psci.c @@ -21,7 +21,9 @@ #include "exec/helper-proto.h" #include "kvm-consts.h" #include "qemu/main-loop.h" +#include "qemu/error-report.h" #include "sysemu/runstate.h" +#include "sysemu/tcg.h" #include "internals.h" #include "arm-powerctl.h" @@ -157,6 +159,11 @@ void arm_handle_psci_call(ARMCPU *cpu) case QEMU_PSCI_0_1_FN_CPU_SUSPEND: case QEMU_PSCI_0_2_FN_CPU_SUSPEND: case QEMU_PSCI_0_2_FN64_CPU_SUSPEND: + if (!tcg_enabled()) { + warn_report("CPU suspend not supported in non-tcg mode"); + break; + } +#ifdef CONFIG_TCG /* Affinity levels are not supported in QEMU */ if (param[1] & 0xfffe0000) { ret = QEMU_PSCI_RET_INVALID_PARAMS; @@ -169,6 +176,7 @@ void arm_handle_psci_call(ARMCPU *cpu) env->regs[0] = 0; } helper_wfi(env, 4); +#endif break; case QEMU_PSCI_1_0_FN_PSCI_FEATURES: switch (param[1]) { diff --git a/target/arm/syndrome.h b/target/arm/syndrome.h index
95454b5b3bb01be2800f0933da43635c9f6fcde5..eccb759da6bb982f532b220867fd0898d2b16c92 100644 --- a/target/arm/syndrome.h +++ b/target/arm/syndrome.h @@ -25,6 +25,8 @@ #ifndef TARGET_ARM_SYNDROME_H #define TARGET_ARM_SYNDROME_H +#include "qemu/bitops.h" + /* Valid Syndrome Register EC field values */ enum arm_exception_class { EC_UNCATEGORIZED = 0x00, @@ -80,6 +82,7 @@ typedef enum { SME_ET_InactiveZA, } SMEExceptionType; +#define ARM_EL_EC_LENGTH 6 #define ARM_EL_EC_SHIFT 26 #define ARM_EL_IL_SHIFT 25 #define ARM_EL_ISV_SHIFT 24 @@ -91,6 +94,11 @@ static inline uint32_t syn_get_ec(uint32_t syn) return syn >> ARM_EL_EC_SHIFT; } +static inline uint32_t syn_set_ec(uint32_t syn, uint32_t ec) +{ + return deposit32(syn, ARM_EL_EC_SHIFT, ARM_EL_EC_LENGTH, ec); +} + /* * Utility functions for constructing various kinds of syndrome value. * Note that in general we follow the AArch64 syndrome values; in a diff --git a/target/arm/tcg/meson.build b/target/arm/tcg/meson.build index 6fca38f2ccb1d37ecdd15ba2a082b744231828da..ad3cfcb3bda90eede7a8bed922c38c04f388fdce 100644 --- a/target/arm/tcg/meson.build +++ b/target/arm/tcg/meson.build @@ -51,7 +51,3 @@ arm_ss.add(when: 'TARGET_AARCH64', if_true: files( 'sme_helper.c', 'sve_helper.c', )) - -arm_system_ss.add(files( - 'psci.c', -)) diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c index 1ee2690ceb5a293fdc23146b1a9b294b2b2c49a5..ae4f39ed02f13b641f5e1ceb4c83d313d9209d89 100644 --- a/target/arm/tcg/sme_helper.c +++ b/target/arm/tcg/sme_helper.c @@ -573,8 +573,8 @@ void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -750,8 +750,8 @@ void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -916,7 +916,7 @@ void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn, if (pb & 1) { uint32_t *a = vza_row + H1_4(col); uint32_t *m = vzm + H1_4(col); - *a = float32_muladd(n, *m, *a, 0, vst); + *a = float32_muladd(n, *m, *a, 0, &fpst); } col += 4; pb >>= 4; @@ -1134,10 +1134,10 @@ static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \ uint64_t sum = 0; \ /* Apply P to N as a mask, making the inactive elements 0. */ \ n &= expand_pred_h(p); \ - sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ - sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ - sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ - sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ + sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0); \ + sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16); \ + sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32); \ + sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48); \ return neg ? 
a - sum : a + sum; \ } diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c index f006d152cc9dc8581cbf94918e7e8768d5b6e341..9694201550aa69251cf704aa8e7fe71fc469dba0 100644 --- a/target/arm/tcg/sve_helper.c +++ b/target/arm/tcg/sve_helper.c @@ -5800,8 +5800,8 @@ void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -6156,8 +6156,8 @@ void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. */ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } @@ -6306,9 +6306,6 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, flags = info.page[0].flags | info.page[1].flags; if (unlikely(flags != 0)) { -#ifdef CONFIG_USER_ONLY - g_assert_not_reached(); -#else /* * At least one page includes MMIO. * Any bus operation can fail with cpu_transaction_failed, @@ -6339,7 +6336,6 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, } while (reg_off & 63); } while (reg_off <= reg_last); return; -#endif } mem_off = info.mem_off_first[0]; @@ -6410,8 +6406,8 @@ void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); /* Perform gross MTE suppression early. 
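 * (These checks must consult mtedesc rather than desc: after the
 * extract32() above, desc retains only the SIMD descriptor bits, while
 * the TBI and TCMA fields now live in mtedesc.)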
*/ - if (!tbi_check(desc, bit55) || - tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + if (!tbi_check(mtedesc, bit55) || + tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { mtedesc = 0; } diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index a2e49c39f9f3caf17b1bc1fdbba6018121962777..5beac07b602a26e8b655097d6e59702f6b1119af 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -3306,7 +3306,7 @@ static bool trans_LDAPR(DisasContext *s, arg_LDAPR *a) if (a->rn == 31) { gen_check_sp_alignment(s); } - mop = check_atomic_align(s, a->rn, a->sz); + mop = check_ordered_align(s, a->rn, 0, false, a->sz); clean_addr = gen_mte_check1(s, cpu_reg_sp(s, a->rn), false, a->rn != 31, mop); /* @@ -8221,7 +8221,7 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q, narrowfn(tcg_rd_narrowed, tcg_env, tcg_rd); tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed); if (i == 0) { - tcg_gen_mov_i64(tcg_final, tcg_rd); + tcg_gen_extract_i64(tcg_final, tcg_rd, 0, esize); } else { tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); } @@ -10141,6 +10141,7 @@ static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u, tcg_gen_shli_i64(tcg_rd, tcg_rd, shift); write_vec_element(s, tcg_rd, rd, i, size + 1); } + clear_vec_high(s, true, rd); } /* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */ diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c index 8f0dfc884ec6a0c5dd5ae213649bb18e04b37839..1e89516736612042f037bdc4011c20d33d7cfa66 100644 --- a/target/arm/tcg/translate-sme.c +++ b/target/arm/tcg/translate-sme.c @@ -49,7 +49,15 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, int rs, /* Prepare a power-of-two modulo via extraction of @len bits. */ len = ctz32(streaming_vec_reg_size(s)) - esz; - if (vertical) { + if (!len) { + /* + * SVL is 128 and the element size is 128. There is exactly + * one 128x128 tile in the ZA storage, and so we calculate + * (Rs + imm) MOD 1, which is always 0. We need to special case + * this because TCG doesn't allow deposit ops with len 0. + */ + tcg_gen_movi_i32(tmp, 0); + } else if (vertical) { /* * Compute the byte offset of the index within the tile: * (index % (svl / size)) * size diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index 296e7d1ce223b6c7e33a6dbe6519d2d17002fd17..1b722ae75d2d8c51b6dec3d207da6fc67c27cd52 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -50,13 +50,27 @@ static int tszimm_esz(DisasContext *s, int x) static int tszimm_shr(DisasContext *s, int x) { - return (16 << tszimm_esz(s, x)) - x; + /* + * We won't use the tszimm_shr() value if tszimm_esz() returns -1 (the + * trans function will check for esz < 0), so we can return any + * value we like from here in that case as long as we avoid UB. + */ + int esz = tszimm_esz(s, x); + if (esz < 0) { + return esz; + } + return (16 << esz) - x; } /* See e.g. LSL (immediate, predicated). */ static int tszimm_shl(DisasContext *s, int x) { - return x - (8 << tszimm_esz(s, x)); + /* As with tszimm_shr(), value will be unused if esz < 0 */ + int esz = tszimm_esz(s, x); + if (esz < 0) { + return esz; + } + return x - (8 << esz); } /* The SH bit is in bit 8. Extract the low 8 and shift. 
*/ @@ -4443,26 +4457,28 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, { unsigned vsz = vec_full_reg_size(s); TCGv_ptr t_pg; + uint32_t sizem1; int desc = 0; - /* - * For e.g. LD4, there are not enough arguments to pass all 4 - * registers as pointers, so encode the regno into the data field. - * For consistency, do this even for LD1. - */ + assert(mte_n >= 1 && mte_n <= 4); + sizem1 = (mte_n << dtype_msz(dtype)) - 1; + assert(sizem1 <= R_MTEDESC_SIZEM1_MASK >> R_MTEDESC_SIZEM1_SHIFT); if (s->mte_active[0]) { - int msz = dtype_msz(dtype); - desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); - desc = FIELD_DP32(desc, MTEDESC, SIZEM1, (mte_n << msz) - 1); + desc = FIELD_DP32(desc, MTEDESC, SIZEM1, sizem1); desc <<= SVE_MTEDESC_SHIFT; } else { addr = clean_data_tbi(s, addr); } + /* + * For e.g. LD4, there are not enough arguments to pass all 4 + * registers as pointers, so encode the regno into the data field. + * For consistency, do this even for LD1. + */ desc = simd_desc(vsz, vsz, zt | desc); t_pg = tcg_temp_new_ptr(); @@ -4600,7 +4616,7 @@ static void do_ld_zpa(DisasContext *s, int zt, int pg, * accessible via the instruction encoding. */ assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, dtype, nreg, false, fn); + do_mem_zpa(s, zt, pg, addr, dtype, nreg + 1, false, fn); } static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) @@ -5168,14 +5184,13 @@ static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, if (nreg == 0) { /* ST1 */ fn = fn_single[s->mte_active[0]][be][msz][esz]; - nreg = 1; } else { /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ assert(msz == esz); fn = fn_multiple[s->mte_active[0]][be][nreg - 1][msz]; } assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg, true, fn); + do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg + 1, true, fn); } static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index b3660173d1d07aa7c765a9d8e7789ccc3bcc114d..e555e885a13edf0deb39ac9ffc86b57c6a9e5a26 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -4584,7 +4584,7 @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64, tcg_gen_andi_i32(t, t, 1u << maskbit); tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label); - gen_exception_insn(s, 0, EXCP_UDEF, syndrome); + gen_exception_insn_el(s, 0, EXCP_UDEF, syndrome, 2); /* * gen_exception_insn() will set is_jmp to DISAS_NORETURN, * but since we're conditionally branching over it, we want diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 1f93510b85cd352f093552b2897a87d6f7e98ea3..83b49ef0092cb9fa78e386565950c2bb377dd1a5 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -692,6 +692,13 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ { \ intptr_t i = 0, opr_sz = simd_oprsz(desc); \ intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ + /* \ + * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ + * first iteration might not be a full 16 byte segment. But \ + * for vector lengths beyond that this must be SVE and we know \ + * opr_sz is a multiple of 16, so we need not clamp segend \ + * to opr_sz_n when we advance it at the end of the loop. 
\ + */ \ intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ intptr_t index = simd_data(desc); \ TYPED *d = vd, *a = va; \ @@ -709,7 +716,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ n[i * 4 + 2] * m2 + \ n[i * 4 + 3] * m3); \ } while (++i < segend); \ - segend = i + 4; \ + segend = i + (16 / sizeof(TYPED)); \ } while (i < opr_sz_n); \ clear_tail(d, opr_sz, simd_maxsz(desc)); \ } @@ -843,7 +850,7 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); uint32_t neg_real = flip ^ neg_imag; intptr_t elements = opr_sz / sizeof(float16); - intptr_t eltspersegment = 16 / sizeof(float16); + intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); intptr_t i, j; /* Shift boolean to the sign bit so we can xor to negate. */ @@ -905,7 +912,7 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); uint32_t neg_real = flip ^ neg_imag; intptr_t elements = opr_sz / sizeof(float32); - intptr_t eltspersegment = 16 / sizeof(float32); + intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); intptr_t i, j; /* Shift boolean to the sign bit so we can xor to negate. */ diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c index 3e5e37abbe8787652fa04bc8e968a08f39ec7426..ff59bc552228339cb2f0baececf0e18b7b6856a0 100644 --- a/target/arm/vfp_helper.c +++ b/target/arm/vfp_helper.c @@ -1121,8 +1121,8 @@ const FloatRoundMode arm_rmode_to_sf_map[] = { uint64_t HELPER(fjcvtzs)(float64 value, void *vstatus) { float_status *status = vstatus; - uint32_t inexact, frac; - uint32_t e_old, e_new; + uint32_t frac, e_old, e_new; + bool inexact; e_old = get_float_exception_flags(status); set_float_exception_flags(0, status); @@ -1130,13 +1130,13 @@ uint64_t HELPER(fjcvtzs)(float64 value, void *vstatus) e_new = get_float_exception_flags(status); set_float_exception_flags(e_old | e_new, status); - if (value == float64_chs(float64_zero)) { - /* While not inexact for IEEE FP, -0.0 is inexact for JavaScript. */ - inexact = 1; - } else { - /* Normal inexact or overflow or NaN */ - inexact = e_new & (float_flag_inexact | float_flag_invalid); - } + /* Normal inexact, denormal with flush-to-zero, or overflow or NaN */ + inexact = e_new & (float_flag_inexact | + float_flag_input_denormal | + float_flag_invalid); + + /* While not inexact for IEEE FP, -0.0 is inexact for JavaScript. */ + inexact |= value == float64_chs(float64_zero); /* Pack the result and the env->ZF representation of Z together. */ return deposit64(frac, 32, 32, inexact); diff --git a/target/hexagon/idef-parser/parser-helpers.c b/target/hexagon/idef-parser/parser-helpers.c index 4af020933aaa3809ce6e23b56c1893df38bd8b6d..a83099de6b4959275d5b8b2ffd5de5706d3fbae7 100644 --- a/target/hexagon/idef-parser/parser-helpers.c +++ b/target/hexagon/idef-parser/parser-helpers.c @@ -2123,9 +2123,16 @@ void free_instruction(Context *c) g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE); } g_array_free(c->inst.strings, TRUE); + /* + * Free list of arguments that might need initialization, if they haven't + * already been freed. 
+ */ + if (c->inst.init_list) { + g_array_free(c->inst.init_list, TRUE); + } /* Free INAME token value */ g_string_free(c->inst.name, TRUE); - /* Free variables and registers */ + /* Free declared TCGv variables */ g_array_free(c->inst.allocated, TRUE); /* Initialize instruction-specific portion of the context */ memset(&(c->inst), 0, sizeof(Inst)); diff --git a/target/hexagon/meson.build b/target/hexagon/meson.build index da8e608d0065defad0f18d437e57783286c313cd..436217f25a13b295f7ab0940c5cd7bbed0051efa 100644 --- a/target/hexagon/meson.build +++ b/target/hexagon/meson.build @@ -188,7 +188,7 @@ if idef_parser_enabled and 'hexagon-linux-user' in target_dirs arguments: ['@INPUT@', '--defines=@OUTPUT1@', '--output=@OUTPUT0@'] ) - glib_dep = dependency('glib-2.0', native: true) + glib_dep = dependency('glib-2.0', native: true, static: false) idef_parser = executable( 'idef-parser', diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h index 8be45c69c99ab53e5da3d9d6be304e082a14e2e5..ba100c21a264d1041d2c05d38ed15313d411f422 100644 --- a/target/hppa/cpu.h +++ b/target/hppa/cpu.h @@ -188,7 +188,7 @@ typedef struct CPUArchState { target_ulong psw; /* All psw bits except the following: */ target_ulong psw_n; /* boolean */ - target_long psw_v; /* in most significant bit */ + target_long psw_v; /* in bit 31 */ /* Splitting the carry-borrow field into the MSB and "the rest", allows * for "the rest" to be deleted when it is unused, but the MSB is in use. diff --git a/target/hppa/helper.c b/target/hppa/helper.c index 859644c47af1d709772277950cb6cfcf94b6ad9c..9e35b65f2939d05713c13640ad41d85b9813a46e 100644 --- a/target/hppa/helper.c +++ b/target/hppa/helper.c @@ -53,7 +53,7 @@ target_ulong cpu_hppa_get_psw(CPUHPPAState *env) } psw |= env->psw_n * PSW_N; - psw |= (env->psw_v < 0) * PSW_V; + psw |= ((env->psw_v >> 31) & 1) * PSW_V; psw |= env->psw; return psw; diff --git a/target/i386/cpu.c b/target/i386/cpu.c index cd16cb893daf8d2a2be94a91f5ad47de787a6080..f79d0c9abf317fd363d521de58294c872f5749d6 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -29,6 +29,7 @@ #include "hvf/hvf-i386.h" #include "kvm/kvm_i386.h" #include "sev.h" +#include "csv.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "qapi/qapi-visit-machine.h" @@ -905,9 +906,9 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .type = CPUID_FEATURE_WORD, .feat_names = { "fsgsbase", "tsc-adjust", "sgx", "bmi1", - "hle", "avx2", NULL, "smep", + "hle", "avx2", "fdp-excptn-only", "smep", "bmi2", "erms", "invpcid", "rtm", - NULL, NULL, "mpx", NULL, + NULL, "zero-fcs-fds", "mpx", NULL, "avx512f", "avx512dq", "rdseed", "adx", "smap", "avx512ifma", "pcommit", "clflushopt", "clwb", "intel-pt", "avx512pf", "avx512er", @@ -961,13 +962,13 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_7_1_EAX] = { .type = CPUID_FEATURE_WORD, .feat_names = { - NULL, NULL, NULL, NULL, + "sha512", "sm3", "sm4", NULL, "avx-vnni", "avx512-bf16", NULL, "cmpccxadd", NULL, NULL, "fzrm", "fsrs", "fsrc", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, + NULL, "fred", "lkgs", "wrmsrns", NULL, "amx-fp16", NULL, "avx-ifma", - NULL, NULL, NULL, NULL, + NULL, NULL, "lam", NULL, NULL, NULL, NULL, NULL, }, .cpuid = { @@ -999,8 +1000,8 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_7_2_EDX] = { .type = CPUID_FEATURE_WORD, .feat_names = { - NULL, NULL, NULL, NULL, - NULL, "mcdt-no", NULL, NULL, + "intel-psfd", "ipred-ctrl", "rrsba-ctrl", "ddpd-u", + "bhi-ctrl", "mcdt-no", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, @@ -1156,9 +1157,9 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "taa-no", NULL, NULL, NULL, NULL, "sbdr-ssdp-no", "fbsdp-no", "psdp-no", NULL, "fb-clear", NULL, NULL, - NULL, NULL, NULL, NULL, - "pbrsb-no", NULL, "gds-no", NULL, - NULL, NULL, NULL, NULL, + "bhi-no", NULL, NULL, NULL, + "pbrsb-no", NULL, "gds-no", "rfds-no", + "rfds-clear", NULL, NULL, NULL, }, .msr = { .index = MSR_IA32_ARCH_CAPABILITIES, @@ -1270,7 +1271,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "vmx-exit-save-efer", "vmx-exit-load-efer", "vmx-exit-save-preemption-timer", "vmx-exit-clear-bndcfgs", NULL, "vmx-exit-clear-rtit-ctl", NULL, NULL, - NULL, "vmx-exit-load-pkrs", NULL, NULL, + NULL, "vmx-exit-load-pkrs", NULL, "vmx-exit-secondary-ctls", }, .msr = { .index = MSR_IA32_VMX_TRUE_EXIT_CTLS, @@ -1285,7 +1286,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { NULL, "vmx-entry-ia32e-mode", NULL, NULL, NULL, "vmx-entry-load-perf-global-ctrl", "vmx-entry-load-pat", "vmx-entry-load-efer", "vmx-entry-load-bndcfgs", NULL, "vmx-entry-load-rtit-ctl", NULL, - NULL, NULL, "vmx-entry-load-pkrs", NULL, + NULL, NULL, "vmx-entry-load-pkrs", "vmx-entry-load-fred", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }, @@ -1343,6 +1344,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [54] = "vmx-ins-outs", [55] = "vmx-true-ctls", [56] = "vmx-any-errcode", + [58] = "vmx-nested-exception", }, .msr = { .index = MSR_IA32_VMX_BASIC, @@ -1549,8 +1551,20 @@ static FeatureDep feature_dependencies[] = { .to = { FEAT_SVM, ~0ull }, }, { - .from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, - .to = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, + .from = { FEAT_7_0_ECX, CPUID_7_0_ECX_WAITPKG }, + .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE }, + }, + { + .from = { FEAT_8000_0001_EDX, CPUID_EXT2_LM }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, + }, + { + .from = { FEAT_7_1_EAX, CPUID_7_1_EAX_LKGS }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, + }, + { + .from = { FEAT_7_1_EAX, CPUID_7_1_EAX_WRMSRNS }, + .to = { FEAT_7_1_EAX, CPUID_7_1_EAX_FRED }, }, }; @@ -1671,9 +1685,10 @@ static inline uint64_t x86_cpu_xsave_xss_components(X86CPU *cpu) * Returns the set of feature flags that are supported and migratable by * QEMU, for a given FeatureWord. 
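* For example, bits that have no name in feat_names, or that are listed in * a word's unmigratable_flags, are dropped from this set; callers that pass * cpu == NULL to x86_cpu_get_supported_feature_word() skip this filtering * entirely.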
*/ -static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) +static uint64_t x86_cpu_get_migratable_flags(X86CPU *cpu, FeatureWord w) { FeatureWordInfo *wi = &feature_word_info[w]; + CPUX86State *env = &cpu->env; uint64_t r = 0; int i; @@ -1687,6 +1702,12 @@ static uint64_t x86_cpu_get_migratable_flags(FeatureWord w) r |= f; } } + + /* when tsc-khz is set explicitly, invtsc is migratable */ + if ((w == FEAT_8000_0007_EDX) && env->user_tsc_khz) { + r |= CPUID_APM_INVTSC; + } + return r; } @@ -2162,6 +2183,56 @@ static const CPUCaches epyc_genoa_cache_info = { }, }; +static const CPUCaches dharma_cache_info = { + .l1d_cache = &(CPUCacheInfo) { + .type = DATA_CACHE, + .level = 1, + .size = 32 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 64, + .lines_per_tag = 1, + .self_init = 1, + .no_invd_sharing = true, + }, + .l1i_cache = &(CPUCacheInfo) { + .type = INSTRUCTION_CACHE, + .level = 1, + .size = 32 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 64, + .lines_per_tag = 1, + .self_init = 1, + .no_invd_sharing = true, + }, + .l2_cache = &(CPUCacheInfo) { + .type = UNIFIED_CACHE, + .level = 2, + .size = 512 * KiB, + .line_size = 64, + .associativity = 8, + .partitions = 1, + .sets = 1024, + .lines_per_tag = 1, + }, + .l3_cache = &(CPUCacheInfo) { + .type = UNIFIED_CACHE, + .level = 3, + .size = 16 * MiB, + .line_size = 64, + .associativity = 16, + .partitions = 1, + .sets = 16384, + .lines_per_tag = 1, + .self_init = true, + .inclusive = true, + .complex_indexing = true, + }, +}; + /* The following VMX features are not supported by KVM and are left out in the * CPU definitions: * @@ -3402,6 +3473,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { }, { .version = 4, + .note = "IBRS, EPT switching, no TSX", .props = (PropValue[]) { { "vmx-eptp-switching", "on" }, { /* end of list */ } @@ -3536,7 +3608,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { }, }, { .version = 4, - .note = "ARCH_CAPABILITIES, no TSX", + .note = "ARCH_CAPABILITIES, EPT switching, no TSX", .props = (PropValue[]) { { "vmx-eptp-switching", "on" }, { /* end of list */ } @@ -3822,6 +3894,16 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } }, }, + { + .version = 7, + .note = "TSX, taa-no", + .props = (PropValue[]) { + /* Restore TSX features removed by -v2 above */ + { "hle", "on" }, + { "rtm", "on" }, + { /* end of list */ } + }, + }, { /* end of list */ } } }, @@ -3960,6 +4042,17 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } } }, + { + .version = 3, + .props = (PropValue[]) { + { "ss", "on" }, + { "tsc-adjust", "on" }, + { "cldemote", "on" }, + { "movdiri", "on" }, + { "movdir64b", "on" }, + { /* end of list */ } + } + }, { /* end of list */ } } }, @@ -4099,6 +4192,286 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, + { + .name = "SierraForest", + .level = 0x23, + .vendor = CPUID_VENDOR_INTEL, + .family = 6, + .model = 175, + .stepping = 0, + /* + * please keep the ascending order so that we can have a clear view of + * bit position of each feature. 
+ */ + .features[FEAT_1_EDX] = + CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC | + CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | + CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | + CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR | + CPUID_SSE | CPUID_SSE2, + .features[FEAT_1_ECX] = + CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 | + CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 | + CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE | + CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES | + CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | CPUID_EXT_RDRAND, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_RDTSCP | CPUID_EXT2_LM, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_WBNOINVD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | + CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | + CPUID_7_0_EBX_INVPCID | CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | + CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | CPUID_7_0_EBX_CLWB | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = + CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | CPUID_7_0_ECX_GFNI | + CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ | + CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT, + .features[FEAT_7_0_EDX] = + CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE | + CPUID_7_0_EDX_SPEC_CTRL | CPUID_7_0_EDX_ARCH_CAPABILITIES | + CPUID_7_0_EDX_SPEC_CTRL_SSBD, + .features[FEAT_ARCH_CAPABILITIES] = + MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | + MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO | + MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_SBDR_SSDP_NO | + MSR_ARCH_CAP_FBSDP_NO | MSR_ARCH_CAP_PSDP_NO | + MSR_ARCH_CAP_PBRSB_NO, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_7_1_EAX] = + CPUID_7_1_EAX_AVX_VNNI | CPUID_7_1_EAX_CMPCCXADD | + CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_AVX_IFMA, + .features[FEAT_7_1_EDX] = + CPUID_7_1_EDX_AVX_VNNI_INT8 | CPUID_7_1_EDX_AVX_NE_CONVERT, + .features[FEAT_7_2_EDX] = + CPUID_7_2_EDX_MCDT_NO, + .features[FEAT_VMX_BASIC] = + MSR_VMX_BASIC_INS_OUTS | MSR_VMX_BASIC_TRUE_CTLS, + .features[FEAT_VMX_ENTRY_CTLS] = + VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS | VMX_VM_ENTRY_IA32E_MODE | + VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_ENTRY_LOAD_IA32_PAT | VMX_VM_ENTRY_LOAD_IA32_EFER, + .features[FEAT_VMX_EPT_VPID_CAPS] = + MSR_VMX_EPT_EXECONLY | MSR_VMX_EPT_PAGE_WALK_LENGTH_4 | + MSR_VMX_EPT_WB | MSR_VMX_EPT_2MB | MSR_VMX_EPT_1GB | + MSR_VMX_EPT_INVEPT | MSR_VMX_EPT_AD_BITS | + MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT | MSR_VMX_EPT_INVEPT_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID | MSR_VMX_EPT_INVVPID_SINGLE_ADDR | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT | + MSR_VMX_EPT_INVVPID_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS, + .features[FEAT_VMX_EXIT_CTLS] = + VMX_VM_EXIT_SAVE_DEBUG_CONTROLS | + VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_EXIT_ACK_INTR_ON_EXIT | VMX_VM_EXIT_SAVE_IA32_PAT | + VMX_VM_EXIT_LOAD_IA32_PAT | VMX_VM_EXIT_SAVE_IA32_EFER | + VMX_VM_EXIT_LOAD_IA32_EFER | VMX_VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, + .features[FEAT_VMX_MISC] = + MSR_VMX_MISC_STORE_LMA | MSR_VMX_MISC_ACTIVITY_HLT | + MSR_VMX_MISC_VMWRITE_VMEXIT, + .features[FEAT_VMX_PINBASED_CTLS] 
= + VMX_PIN_BASED_EXT_INTR_MASK | VMX_PIN_BASED_NMI_EXITING | + VMX_PIN_BASED_VIRTUAL_NMIS | VMX_PIN_BASED_VMX_PREEMPTION_TIMER | + VMX_PIN_BASED_POSTED_INTR, + .features[FEAT_VMX_PROCBASED_CTLS] = + VMX_CPU_BASED_VIRTUAL_INTR_PENDING | + VMX_CPU_BASED_USE_TSC_OFFSETING | VMX_CPU_BASED_HLT_EXITING | + VMX_CPU_BASED_INVLPG_EXITING | VMX_CPU_BASED_MWAIT_EXITING | + VMX_CPU_BASED_RDPMC_EXITING | VMX_CPU_BASED_RDTSC_EXITING | + VMX_CPU_BASED_CR3_LOAD_EXITING | VMX_CPU_BASED_CR3_STORE_EXITING | + VMX_CPU_BASED_CR8_LOAD_EXITING | VMX_CPU_BASED_CR8_STORE_EXITING | + VMX_CPU_BASED_TPR_SHADOW | VMX_CPU_BASED_VIRTUAL_NMI_PENDING | + VMX_CPU_BASED_MOV_DR_EXITING | VMX_CPU_BASED_UNCOND_IO_EXITING | + VMX_CPU_BASED_USE_IO_BITMAPS | VMX_CPU_BASED_MONITOR_TRAP_FLAG | + VMX_CPU_BASED_USE_MSR_BITMAPS | VMX_CPU_BASED_MONITOR_EXITING | + VMX_CPU_BASED_PAUSE_EXITING | + VMX_CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, + .features[FEAT_VMX_SECONDARY_CTLS] = + VMX_SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + VMX_SECONDARY_EXEC_ENABLE_EPT | VMX_SECONDARY_EXEC_DESC | + VMX_SECONDARY_EXEC_RDTSCP | + VMX_SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + VMX_SECONDARY_EXEC_ENABLE_VPID | VMX_SECONDARY_EXEC_WBINVD_EXITING | + VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST | + VMX_SECONDARY_EXEC_APIC_REGISTER_VIRT | + VMX_SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + VMX_SECONDARY_EXEC_RDRAND_EXITING | + VMX_SECONDARY_EXEC_ENABLE_INVPCID | + VMX_SECONDARY_EXEC_ENABLE_VMFUNC | VMX_SECONDARY_EXEC_SHADOW_VMCS | + VMX_SECONDARY_EXEC_RDSEED_EXITING | VMX_SECONDARY_EXEC_ENABLE_PML | + VMX_SECONDARY_EXEC_XSAVES, + .features[FEAT_VMX_VMFUNC] = + MSR_VMX_VMFUNC_EPT_SWITCHING, + .xlevel = 0x80000008, + .model_id = "Intel Xeon Processor (SierraForest)", + .versions = (X86CPUVersionDefinition[]) { + { .version = 1 }, + { + .version = 2, + .props = (PropValue[]) { + { "ss", "on" }, + { "tsc-adjust", "on" }, + { "cldemote", "on" }, + { "movdiri", "on" }, + { "movdir64b", "on" }, + { "gds-no", "on" }, + { "rfds-no", "on" }, + { "lam", "on" }, + { "intel-psfd", "on"}, + { "ipred-ctrl", "on"}, + { "rrsba-ctrl", "on"}, + { "bhi-ctrl", "on"}, + { "stepping", "3" }, + { /* end of list */ } + } + }, + { /* end of list */ }, + }, + }, + { + .name = "ClearwaterForest", + .level = 0x23, + .xlevel = 0x80000008, + .vendor = CPUID_VENDOR_INTEL, + .family = 6, + .model = 221, + .stepping = 0, + /* + * please keep the ascending order so that we can have a clear view of + * bit position of each feature. 
+ */ + .features[FEAT_1_EDX] = + CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC | + CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | + CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | + CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR | + CPUID_SSE | CPUID_SSE2 | CPUID_SS, + .features[FEAT_1_ECX] = + CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 | + CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 | + CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE | + CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES | + CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | CPUID_EXT_RDRAND, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_RDTSCP | CPUID_EXT2_LM, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_WBNOINVD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_TSC_ADJUST | + CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | + CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | + CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | + CPUID_7_0_EBX_CLFLUSHOPT | CPUID_7_0_EBX_CLWB | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = + CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU | CPUID_7_0_ECX_GFNI | + CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ | + CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT | + CPUID_7_0_ECX_CLDEMOTE | CPUID_7_0_ECX_MOVDIRI | + CPUID_7_0_ECX_MOVDIR64B, + .features[FEAT_7_0_EDX] = + CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE | + CPUID_7_0_EDX_SPEC_CTRL | CPUID_7_0_EDX_ARCH_CAPABILITIES | + CPUID_7_0_EDX_SPEC_CTRL_SSBD, + .features[FEAT_ARCH_CAPABILITIES] = + MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL | + MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO | + MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_SBDR_SSDP_NO | + MSR_ARCH_CAP_FBSDP_NO | MSR_ARCH_CAP_PSDP_NO | + MSR_ARCH_CAP_BHI_NO | MSR_ARCH_CAP_PBRSB_NO | + MSR_ARCH_CAP_GDS_NO | MSR_ARCH_CAP_RFDS_NO, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_7_1_EAX] = + CPUID_7_1_EAX_SHA512 | CPUID_7_1_EAX_SM3 | CPUID_7_1_EAX_SM4 | + CPUID_7_1_EAX_AVX_VNNI | CPUID_7_1_EAX_CMPCCXADD | + CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_AVX_IFMA | + CPUID_7_1_EAX_LAM, + .features[FEAT_7_1_EDX] = + CPUID_7_1_EDX_AVX_VNNI_INT8 | CPUID_7_1_EDX_AVX_NE_CONVERT | + CPUID_7_1_EDX_AVX_VNNI_INT16 | CPUID_7_1_EDX_PREFETCHITI, + .features[FEAT_7_2_EDX] = + CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL | + CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_DDPD_U | + CPUID_7_2_EDX_BHI_CTRL | CPUID_7_2_EDX_MCDT_NO, + .features[FEAT_VMX_BASIC] = + MSR_VMX_BASIC_INS_OUTS | MSR_VMX_BASIC_TRUE_CTLS, + .features[FEAT_VMX_ENTRY_CTLS] = + VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS | VMX_VM_ENTRY_IA32E_MODE | + VMX_VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_ENTRY_LOAD_IA32_PAT | VMX_VM_ENTRY_LOAD_IA32_EFER, + .features[FEAT_VMX_EPT_VPID_CAPS] = + MSR_VMX_EPT_EXECONLY | MSR_VMX_EPT_PAGE_WALK_LENGTH_4 | + MSR_VMX_EPT_WB | MSR_VMX_EPT_2MB | MSR_VMX_EPT_1GB | + MSR_VMX_EPT_INVEPT | MSR_VMX_EPT_AD_BITS | + MSR_VMX_EPT_INVEPT_SINGLE_CONTEXT | MSR_VMX_EPT_INVEPT_ALL_CONTEXT | + MSR_VMX_EPT_INVVPID | MSR_VMX_EPT_INVVPID_SINGLE_ADDR | + MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT | + MSR_VMX_EPT_INVVPID_ALL_CONTEXT | + 
MSR_VMX_EPT_INVVPID_SINGLE_CONTEXT_NOGLOBALS, + .features[FEAT_VMX_EXIT_CTLS] = + VMX_VM_EXIT_SAVE_DEBUG_CONTROLS | + VMX_VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VMX_VM_EXIT_ACK_INTR_ON_EXIT | VMX_VM_EXIT_SAVE_IA32_PAT | + VMX_VM_EXIT_LOAD_IA32_PAT | VMX_VM_EXIT_SAVE_IA32_EFER | + VMX_VM_EXIT_LOAD_IA32_EFER | VMX_VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, + .features[FEAT_VMX_MISC] = + MSR_VMX_MISC_STORE_LMA | MSR_VMX_MISC_ACTIVITY_HLT | + MSR_VMX_MISC_VMWRITE_VMEXIT, + .features[FEAT_VMX_PINBASED_CTLS] = + VMX_PIN_BASED_EXT_INTR_MASK | VMX_PIN_BASED_NMI_EXITING | + VMX_PIN_BASED_VIRTUAL_NMIS | VMX_PIN_BASED_VMX_PREEMPTION_TIMER | + VMX_PIN_BASED_POSTED_INTR, + .features[FEAT_VMX_PROCBASED_CTLS] = + VMX_CPU_BASED_VIRTUAL_INTR_PENDING | + VMX_CPU_BASED_USE_TSC_OFFSETING | VMX_CPU_BASED_HLT_EXITING | + VMX_CPU_BASED_INVLPG_EXITING | VMX_CPU_BASED_MWAIT_EXITING | + VMX_CPU_BASED_RDPMC_EXITING | VMX_CPU_BASED_RDTSC_EXITING | + VMX_CPU_BASED_CR3_LOAD_EXITING | VMX_CPU_BASED_CR3_STORE_EXITING | + VMX_CPU_BASED_CR8_LOAD_EXITING | VMX_CPU_BASED_CR8_STORE_EXITING | + VMX_CPU_BASED_TPR_SHADOW | VMX_CPU_BASED_VIRTUAL_NMI_PENDING | + VMX_CPU_BASED_MOV_DR_EXITING | VMX_CPU_BASED_UNCOND_IO_EXITING | + VMX_CPU_BASED_USE_IO_BITMAPS | VMX_CPU_BASED_MONITOR_TRAP_FLAG | + VMX_CPU_BASED_USE_MSR_BITMAPS | VMX_CPU_BASED_MONITOR_EXITING | + VMX_CPU_BASED_PAUSE_EXITING | + VMX_CPU_BASED_ACTIVATE_SECONDARY_CONTROLS, + .features[FEAT_VMX_SECONDARY_CTLS] = + VMX_SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + VMX_SECONDARY_EXEC_ENABLE_EPT | VMX_SECONDARY_EXEC_DESC | + VMX_SECONDARY_EXEC_RDTSCP | + VMX_SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + VMX_SECONDARY_EXEC_ENABLE_VPID | VMX_SECONDARY_EXEC_WBINVD_EXITING | + VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST | + VMX_SECONDARY_EXEC_APIC_REGISTER_VIRT | + VMX_SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + VMX_SECONDARY_EXEC_RDRAND_EXITING | + VMX_SECONDARY_EXEC_ENABLE_INVPCID | + VMX_SECONDARY_EXEC_ENABLE_VMFUNC | VMX_SECONDARY_EXEC_SHADOW_VMCS | + VMX_SECONDARY_EXEC_RDSEED_EXITING | VMX_SECONDARY_EXEC_ENABLE_PML | + VMX_SECONDARY_EXEC_XSAVES, + .features[FEAT_VMX_VMFUNC] = + MSR_VMX_VMFUNC_EPT_SWITCHING, + .model_id = "Intel Xeon Processor (ClearwaterForest)", + .versions = (X86CPUVersionDefinition[]) { + { .version = 1 }, + { /* end of list */ }, + }, + }, { .name = "Denverton", .level = 21, @@ -4657,6 +5030,20 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } }, }, + { .version = 3, + .props = (PropValue[]) { + { "xsaves", "off" }, + { "perfctr-core", "on" }, + { "clzero", "on" }, + { "xsaveerptr", "on" }, + { "aes", "on" }, + { "pclmulqdq", "on" }, + { "sha-ni", "on" }, + { "model-id", + "Hygon Dhyana-v3 processor" }, + { /* end of list */ } + }, + }, { /* end of list */ } } }, @@ -4852,7 +5239,7 @@ static const X86CPUDefinition builtin_x86_defs[] = { CPUID_8000_0008_EBX_STIBP_ALWAYS_ON | CPUID_8000_0008_EBX_AMD_SSBD | CPUID_8000_0008_EBX_AMD_PSFD, .features[FEAT_8000_0021_EAX] = - CPUID_8000_0021_EAX_No_NESTED_DATA_BP | + CPUID_8000_0021_EAX_NO_NESTED_DATA_BP | CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING | CPUID_8000_0021_EAX_NULL_SEL_CLR_BASE | CPUID_8000_0021_EAX_AUTO_IBRS, @@ -4888,6 +5275,55 @@ static const X86CPUDefinition builtin_x86_defs[] = { .model_id = "AMD EPYC-Genoa Processor", .cache_info = &epyc_genoa_cache_info, }, + { + .name = "Dharma", + .level = 0xd, + .vendor = CPUID_VENDOR_HYGON, + .family = 24, + .model = 4, + .stepping = 0, + .features[FEAT_1_EDX] = + CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | CPUID_CLFLUSH | + CPUID_PSE36 | 
CPUID_PAT | CPUID_CMOV | CPUID_MCA | CPUID_PGE | + CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | CPUID_MCE | + CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | CPUID_DE | + CPUID_VME | CPUID_FP87, + .features[FEAT_1_ECX] = + CPUID_EXT_RDRAND | CPUID_EXT_F16C | CPUID_EXT_AVX | + CPUID_EXT_XSAVE | CPUID_EXT_AES | CPUID_EXT_POPCNT | + CPUID_EXT_MOVBE | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | + CPUID_EXT_CX16 | CPUID_EXT_FMA | CPUID_EXT_SSSE3 | + CPUID_EXT_MONITOR | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_PDPE1GB | + CPUID_EXT2_FFXSR | CPUID_EXT2_MMXEXT | CPUID_EXT2_NX | + CPUID_EXT2_SYSCALL, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_OSVW | CPUID_EXT3_3DNOWPREFETCH | + CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | + CPUID_EXT3_CR8LEG | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM | + CPUID_EXT3_TOPOEXT | CPUID_EXT3_PERFCORE, + .features[FEAT_8000_0008_EBX] = + CPUID_8000_0008_EBX_CLZERO | CPUID_8000_0008_EBX_XSAVEERPTR | + CPUID_8000_0008_EBX_IBPB | CPUID_8000_0008_EBX_IBRS | + CPUID_8000_0008_EBX_STIBP | CPUID_8000_0008_EBX_AMD_SSBD, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_AVX2 | + CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_RDSEED | + CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | + CPUID_7_0_EBX_SHA_NI, + .features[FEAT_7_0_ECX] = CPUID_7_0_ECX_UMIP, + .features[FEAT_XSAVE] = + CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC | + CPUID_XSAVE_XGETBV1, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, + .features[FEAT_SVM] = + CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE, + .xlevel = 0x8000001E, + .model_id = "Hygon Dharma Processor", + .cache_info = &dharma_cache_info, + }, }; /* @@ -5475,6 +5911,7 @@ static void x86_cpu_get_unavailable_features(Object *obj, Visitor *v, x86_cpu_list_feature_names(xc->filtered_features, &result); visit_type_strList(v, "unavailable-features", &result, errp); + qapi_free_strList(result); } /* Print all cpuid feature names in featureset @@ -5683,8 +6120,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) #endif /* !CONFIG_USER_ONLY */ -uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, - bool migratable_only) +uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) { FeatureWordInfo *wi = &feature_word_info[w]; uint64_t r = 0; @@ -5726,8 +6162,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, r &= ~unavail; } #endif - if (migratable_only) { - r &= x86_cpu_get_migratable_flags(w); + if (cpu && cpu->migratable) { + r &= x86_cpu_get_migratable_flags(cpu, w); } return r; } @@ -6566,6 +7002,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) { /* 64 bit processor */ *eax |= (cpu_x86_virtual_addr_width(env) << 8); + *eax |= (cpu->guest_phys_bits << 16); } *ebx = env->features[FEAT_8000_0008_EBX]; if (cs->nr_cores * cs->nr_threads > 1) { @@ -6596,9 +7033,31 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } break; case 0x8000001D: + /* Populate AMD Processor Cache Information */ *eax = 0; if (cpu->cache_info_passthrough) { x86_cpu_get_cache_cpuid(index, count, eax, ebx, ecx, edx); + + /* + * Clear BITs[25:14] and then update them based on the guest + * vCPU topology, like what we do in encode_cache_cpuid8000001d + * when cache_info_passthrough is not enabled. 
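+ * + * Worked example: with threads_per_core = 2 and cores_per_die = 4, + * EAX[25:14] becomes 2 - 1 = 1 for the L1/L2 leaves and + * 2 * 4 - 1 = 7 for the L3 leaf.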
+ */ + *eax &= ~0x03FFC000; + switch (count) { + case 0: /* L1 dcache info */ + case 1: /* L1 icache info */ + case 2: /* L2 cache info */ + *eax |= ((topo_info.threads_per_core - 1) << 14); + break; + case 3: /* L3 cache info */ + *eax |= ((topo_info.cores_per_die * + topo_info.threads_per_core - 1) << 14); + break; + default: /* end of info */ + *eax = *ebx = *ecx = *edx = 0; + break; + } break; } switch (count) { @@ -6660,6 +7119,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, if (sev_enabled()) { *eax = 0x2; *eax |= sev_es_enabled() ? 0x8 : 0; + *eax |= csv3_enabled() ? 0x40000000 : 0; /* bit 30 for CSV3 */ *ebx = sev_get_cbit_position() & 0x3f; /* EBX[5:0] */ *ebx |= (sev_get_reduced_phys_bits() & 0x3f) << 6; /* EBX[11:6] */ } @@ -6927,6 +7387,8 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { env->features[FEAT_XSAVE_XCR0_LO] = 0; env->features[FEAT_XSAVE_XCR0_HI] = 0; + env->features[FEAT_XSAVE_XSS_LO] = 0; + env->features[FEAT_XSAVE_XSS_HI] = 0; return; } @@ -6945,9 +7407,9 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) } env->features[FEAT_XSAVE_XCR0_LO] = mask & CPUID_XSTATE_XCR0_MASK; - env->features[FEAT_XSAVE_XCR0_HI] = mask >> 32; + env->features[FEAT_XSAVE_XCR0_HI] = (mask & CPUID_XSTATE_XCR0_MASK) >> 32; env->features[FEAT_XSAVE_XSS_LO] = mask & CPUID_XSTATE_XSS_MASK; - env->features[FEAT_XSAVE_XSS_HI] = mask >> 32; + env->features[FEAT_XSAVE_XSS_HI] = (mask & CPUID_XSTATE_XSS_MASK) >> 32; } /***** Steps involved on loading and filtering CPUID data @@ -7022,7 +7484,7 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp) * by the user. */ env->features[w] |= - x86_cpu_get_supported_feature_word(w, cpu->migratable) & + x86_cpu_get_supported_feature_word(cpu, w) & ~env->user_features[w] & ~feature_word_info[w].no_autoenable_flags; } @@ -7148,7 +7610,7 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool verbose) for (w = 0; w < FEATURE_WORDS; w++) { uint64_t host_feat = - x86_cpu_get_supported_feature_word(w, false); + x86_cpu_get_supported_feature_word(NULL, w); uint64_t requested_features = env->features[w]; uint64_t unavailable_features = requested_features & ~host_feat; mark_unavailable_features(cpu, w, unavailable_features, prefix); @@ -7264,7 +7726,7 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) env->features[FEAT_PERF_CAPABILITIES] & PERF_CAP_LBR_FMT; if (requested_lbr_fmt && kvm_enabled()) { uint64_t host_perf_cap = - x86_cpu_get_supported_feature_word(FEAT_PERF_CAPABILITIES, false); + x86_cpu_get_supported_feature_word(NULL, FEAT_PERF_CAPABILITIES); unsigned host_lbr_fmt = host_perf_cap & PERF_CAP_LBR_FMT; if (!cpu->enable_pmu) { @@ -7318,6 +7780,14 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) goto out; } + if (cpu->guest_phys_bits == -1) { + /* + * If it was not set by the user, or by the accelerator via + * cpu_exec_realizefn, clear. + */ + cpu->guest_phys_bits = 0; + } + if (cpu->ucode_rev == 0) { /* * The default is the same as KVM's. 
Note that this check @@ -7368,6 +7838,14 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) if (cpu->phys_bits == 0) { cpu->phys_bits = TCG_PHYS_ADDR_BITS; } + if (cpu->guest_phys_bits && + (cpu->guest_phys_bits > cpu->phys_bits || + cpu->guest_phys_bits < 32)) { + error_setg(errp, "guest-phys-bits should be between 32 and %u " + " (but is %u)", + cpu->phys_bits, cpu->guest_phys_bits); + return; + } } else { /* For 32 bit systems don't use the user set value, but keep * phys_bits consistent with what we tell the guest. @@ -7376,6 +7854,10 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) error_setg(errp, "phys-bits is not user-configurable in 32 bit"); return; } + if (cpu->guest_phys_bits != 0) { + error_setg(errp, "guest-phys-bits is not user-configurable in 32 bit"); + return; + } if (env->features[FEAT_1_EDX] & (CPUID_PSE36 | CPUID_PAE)) { cpu->phys_bits = 36; @@ -7664,6 +8146,24 @@ static void x86_cpu_set_pc(CPUState *cs, vaddr value) cpu->env.eip = value; } + +/* At present, we check the vm is *LARGE* or not, i.e. whether + * the memory size is more than 4T or not. + */ +const uint64_t large_vm_mem_size = 0x40000000000UL; +void x86_cpu_adjuest_by_ram_size(ram_addr_t ram_size, X86CPU *cpu) +{ + /* If there is not a large vm, we set the phys_bits to 42 bits, + * otherwise, we increase the phys_bits to 46 bits. + */ + if (ram_size < large_vm_mem_size) { + cpu->phys_bits = DEFAULT_VM_CPU_PHYS_BITS; + } else { + cpu->phys_bits = LARGE_VM_CPU_PHYS_BITS; + cpu->fill_mtrr_mask = true; + } +} + static vaddr x86_cpu_get_pc(CPUState *cs) { X86CPU *cpu = X86_CPU(cs); @@ -7864,9 +8364,10 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_BOOL("x-force-features", X86CPU, force_features, false), DEFINE_PROP_BOOL("kvm", X86CPU, expose_kvm, true), DEFINE_PROP_UINT32("phys-bits", X86CPU, phys_bits, 0), + DEFINE_PROP_UINT32("guest-phys-bits", X86CPU, guest_phys_bits, -1), DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false), DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, host_phys_bits_limit, 0), - DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true), + DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, false), DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7, UINT32_MAX), DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX), diff --git a/target/i386/cpu.h b/target/i386/cpu.h index ef987f344cff2409b7746b12f654ed7e3fb560dd..4424e58d1ba24fd9578623c92ca169e48e73e22b 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -24,6 +24,7 @@ #include "cpu-qom.h" #include "kvm/hyperv-proto.h" #include "exec/cpu-defs.h" +#include "exec/cpu-common.h" #include "qapi/qapi-types-common.h" #include "qemu/cpu-float.h" #include "qemu/timer.h" @@ -261,6 +262,13 @@ typedef enum X86Seg { #define CR4_SMAP_MASK (1U << 21) #define CR4_PKE_MASK (1U << 22) #define CR4_PKS_MASK (1U << 24) +#define CR4_LAM_SUP_MASK (1U << 28) + +#ifdef TARGET_X86_64 +#define CR4_FRED_MASK (1ULL << 32) +#else +#define CR4_FRED_MASK 0 +#endif #define CR4_RESERVED_MASK \ (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \ @@ -269,7 +277,8 @@ typedef enum X86Seg { | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \ | CR4_LA57_MASK \ | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \ - | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK)) + | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \ + | CR4_LAM_SUP_MASK | CR4_FRED_MASK)) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) @@ -519,9 +528,22 @@ typedef 
enum X86Seg { #define MSR_VM_HSAVE_PA 0xc0010117 +#define MSR_AMD64_SEV_ES_GHCB 0xc0010130 + #define MSR_IA32_XFD 0x000001c4 #define MSR_IA32_XFD_ERR 0x000001c5 +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x000001cc /* Stack level 0 regular stack pointer */ +#define MSR_IA32_FRED_RSP1 0x000001cd /* Stack level 1 regular stack pointer */ +#define MSR_IA32_FRED_RSP2 0x000001ce /* Stack level 2 regular stack pointer */ +#define MSR_IA32_FRED_RSP3 0x000001cf /* Stack level 3 regular stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x000001d0 /* FRED exception stack levels */ +#define MSR_IA32_FRED_SSP1 0x000001d1 /* Stack level 1 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_SSP2 0x000001d2 /* Stack level 2 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_SSP3 0x000001d3 /* Stack level 3 shadow stack pointer in ring 0 */ +#define MSR_IA32_FRED_CONFIG 0x000001d4 /* FRED Entrypoint and interrupt stack level */ + #define MSR_IA32_BNDCFGS 0x00000d90 #define MSR_IA32_XSS 0x00000da0 #define MSR_IA32_UMWAIT_CONTROL 0xe1 @@ -633,8 +655,7 @@ typedef enum FeatureWord { } FeatureWord; typedef uint64_t FeatureWordArray[FEATURE_WORDS]; -uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, - bool migratable_only); +uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); /* cpuid_features bits */ #define CPUID_FP87 (1U << 0) @@ -780,6 +801,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, /* Support RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ #define CPUID_7_0_EBX_FSGSBASE (1U << 0) +/* Support TSC adjust MSR */ +#define CPUID_7_0_EBX_TSC_ADJUST (1U << 1) /* Support SGX */ #define CPUID_7_0_EBX_SGX (1U << 2) /* 1st Group of Advanced Bit Manipulation Extensions */ @@ -788,6 +811,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_0_EBX_HLE (1U << 4) /* Intel Advanced Vector Extensions 2 */ #define CPUID_7_0_EBX_AVX2 (1U << 5) +/* FPU data pointer updated only on x87 exceptions */ +#define CPUID_7_0_EBX_FDP_EXCPTN_ONLY (1u << 6) /* Supervisor-mode Execution Prevention */ #define CPUID_7_0_EBX_SMEP (1U << 7) /* 2nd Group of Advanced Bit Manipulation Extensions */ @@ -798,6 +823,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_0_EBX_INVPCID (1U << 10) /* Restricted Transactional Memory */ #define CPUID_7_0_EBX_RTM (1U << 11) +/* Zero out FPU CS and FPU DS */ +#define CPUID_7_0_EBX_ZERO_FCS_FDS (1U << 13) /* Memory Protection Extension */ #define CPUID_7_0_EBX_MPX (1U << 14) /* AVX-512 Foundation */ @@ -909,6 +936,12 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, /* Speculative Store Bypass Disable */ #define CPUID_7_0_EDX_SPEC_CTRL_SSBD (1U << 31) +/* SHA512 Instruction */ +#define CPUID_7_1_EAX_SHA512 (1U << 0) +/* SM3 Instruction */ +#define CPUID_7_1_EAX_SM3 (1U << 1) +/* SM4 Instruction */ +#define CPUID_7_1_EAX_SM4 (1U << 2) /* AVX VNNI Instruction */ #define CPUID_7_1_EAX_AVX_VNNI (1U << 4) /* AVX512 BFloat16 Instruction */ @@ -921,20 +954,40 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_7_1_EAX_FSRS (1U << 11) /* Fast Short REP CMPS/SCAS */ #define CPUID_7_1_EAX_FSRC (1U << 12) +/* Flexible return and event delivery (FRED) */ +#define CPUID_7_1_EAX_FRED (1U << 17) +/* Load into IA32_KERNEL_GS_BASE (LKGS) */ +#define CPUID_7_1_EAX_LKGS (1U << 18) +/* Non-Serializing Write to Model Specific Register (WRMSRNS) */ +#define CPUID_7_1_EAX_WRMSRNS (1U << 19) /* Support Tile Computational Operations on FP16 Numbers */ #define CPUID_7_1_EAX_AMX_FP16 (1U << 21) 
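/* * Illustrative sketch (not part of the enumeration above): consumers gate * CPU state on these bits; e.g. cr4_reserved_bits() below only permits * CR4.FRED once FRED is enumerated: * * if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED)) { * reserved_bits |= CR4_FRED_MASK; * } */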
/* Support for VPMADD52[H,L]UQ */ #define CPUID_7_1_EAX_AVX_IFMA (1U << 23) +/* Linear Address Masking */ +#define CPUID_7_1_EAX_LAM (1U << 26) /* Support for VPDPB[SU,UU,SS]D[,S] */ #define CPUID_7_1_EDX_AVX_VNNI_INT8 (1U << 4) /* AVX NE CONVERT Instructions */ #define CPUID_7_1_EDX_AVX_NE_CONVERT (1U << 5) +/* AVX-VNNI-INT16 Instructions */ +#define CPUID_7_1_EDX_AVX_VNNI_INT16 (1U << 10) /* AMX COMPLEX Instructions */ #define CPUID_7_1_EDX_AMX_COMPLEX (1U << 8) /* PREFETCHIT0/1 Instructions */ #define CPUID_7_1_EDX_PREFETCHITI (1U << 14) +/* Indicate bit 7 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_PSFD (1U << 0) +/* Indicate bits 3 and 4 of the IA32_SPEC_CTRL MSR are supported */ +#define CPUID_7_2_EDX_IPRED_CTRL (1U << 1) +/* Indicate bits 5 and 6 of the IA32_SPEC_CTRL MSR are supported */ +#define CPUID_7_2_EDX_RRSBA_CTRL (1U << 2) +/* Indicate bit 8 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_DDPD_U (1U << 3) +/* Indicate bit 10 of the IA32_SPEC_CTRL MSR is supported */ +#define CPUID_7_2_EDX_BHI_CTRL (1U << 4) /* Do not exhibit MXCSR Configuration Dependent Timing (MCDT) behavior */ #define CPUID_7_2_EDX_MCDT_NO (1U << 5) @@ -964,7 +1017,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define CPUID_8000_0008_EBX_AMD_PSFD (1U << 28) /* Processor ignores nested data breakpoints */ -#define CPUID_8000_0021_EAX_No_NESTED_DATA_BP (1U << 0) +#define CPUID_8000_0021_EAX_NO_NESTED_DATA_BP (1U << 0) /* LFENCE is always serializing */ #define CPUID_8000_0021_EAX_LFENCE_ALWAYS_SERIALIZING (1U << 2) /* Null Selector Clears Base */ @@ -1028,7 +1081,10 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define MSR_ARCH_CAP_FBSDP_NO (1U << 14) #define MSR_ARCH_CAP_PSDP_NO (1U << 15) #define MSR_ARCH_CAP_FB_CLEAR (1U << 17) +#define MSR_ARCH_CAP_BHI_NO (1U << 20) #define MSR_ARCH_CAP_PBRSB_NO (1U << 24) +#define MSR_ARCH_CAP_GDS_NO (1U << 26) +#define MSR_ARCH_CAP_RFDS_NO (1U << 27) #define MSR_CORE_CAP_SPLIT_LOCK_DETECT (1U << 5) @@ -1040,6 +1096,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define MSR_VMX_BASIC_INS_OUTS (1ULL << 54) #define MSR_VMX_BASIC_TRUE_CTLS (1ULL << 55) #define MSR_VMX_BASIC_ANY_ERRCODE (1ULL << 56) +#define MSR_VMX_BASIC_NESTED_EXCEPTION (1ULL << 58) #define MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK 0x1Full #define MSR_VMX_MISC_STORE_LMA (1ULL << 5) @@ -1135,6 +1192,7 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, #define VMX_VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VMX_VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VMX_VM_EXIT_LOAD_IA32_PKRS 0x20000000 +#define VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS 0x80000000 #define VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 #define VMX_VM_ENTRY_IA32E_MODE 0x00000200 @@ -1672,6 +1730,17 @@ typedef struct CPUArchState { target_ulong cstar; target_ulong fmask; target_ulong kernelgsbase; + + /* FRED MSRs */ + uint64_t fred_rsp0; + uint64_t fred_rsp1; + uint64_t fred_rsp2; + uint64_t fred_rsp3; + uint64_t fred_stklvls; + uint64_t fred_ssp1; + uint64_t fred_ssp2; + uint64_t fred_ssp3; + uint64_t fred_config; #endif uint64_t tsc_adjust; @@ -1884,6 +1953,9 @@ typedef struct CPUArchState { /* Number of dies within this CPU package. */ unsigned nr_dies; + + /* GHCB guest physical address info */ + uint64_t ghcb_gpa; } CPUX86State; struct kvm_msrs; @@ -2019,6 +2091,14 @@ struct ArchCPU { /* Number of physical address bits supported */ uint32_t phys_bits; + /* + * Number of guest physical address bits available. 
Usually this is + * identical to host physical address bits. With NPT or EPT 4-level + * paging, guest physical address space might be restricted to 48 bits + * even if the host cpu supports more physical address bits. + */ + uint32_t guest_phys_bits; + /* in order to simplify APIC support, we leave this pointer to the user */ struct DeviceState *apic_state; @@ -2081,6 +2161,11 @@ struct X86CPUClass { extern const VMStateDescription vmstate_x86_cpu; #endif +#define DEFAULT_VM_CPU_PHYS_BITS 42 +#define LARGE_VM_CPU_PHYS_BITS 46 + +void x86_cpu_adjuest_by_ram_size(ram_addr_t ram_size, X86CPU *cpu); + int x86_cpu_pending_interrupt(CPUState *cs, int interrupt_request); int x86_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cpu, @@ -2519,6 +2604,12 @@ static inline uint64_t cr4_reserved_bits(CPUX86State *env) if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS)) { reserved_bits |= CR4_PKS_MASK; } + if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) { + reserved_bits |= CR4_LAM_SUP_MASK; + } + if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED)) { + reserved_bits |= CR4_FRED_MASK; + } return reserved_bits; } diff --git a/target/i386/csv-sysemu-stub.c b/target/i386/csv-sysemu-stub.c new file mode 100644 index 0000000000000000000000000000000000000000..735cce0e4b78c5e13818a8c508c1d5f4304432b5 --- /dev/null +++ b/target/i386/csv-sysemu-stub.c @@ -0,0 +1,51 @@ +/* + * QEMU CSV system stub + * + * Copyright: Hygon Info Technologies Ltd. 2022 + * + * Author: + * Jiang Xin + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "sev.h" +#include "csv.h" + +int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) +{ + return 0; +} + +int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp) +{ + g_assert_not_reached(); +} + +int csv3_launch_encrypt_vmcb(void) +{ + g_assert_not_reached(); +} + +int csv3_shared_region_dma_map(uint64_t start, uint64_t end) +{ + return 0; +} + +void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) +{ + +} + +void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages) +{ + +} + +int csv3_set_guest_private_memory(Error **errp) +{ + g_assert_not_reached(); +} diff --git a/target/i386/csv.c b/target/i386/csv.c new file mode 100644 index 0000000000000000000000000000000000000000..b229f7c317789abb5f72ea5f7a959ed259c915a7 --- /dev/null +++ b/target/i386/csv.c @@ -0,0 +1,757 @@ +/* + * QEMU CSV support + * + * Copyright: Hygon Info Technologies Ltd. 2022 + * + * Author: + * Jiang Xin + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "sysemu/kvm.h" +#include "exec/address-spaces.h" +#include "migration/blocker.h" +#include "migration/qemu-file.h" +#include "migration/misc.h" +#include "monitor/monitor.h" + +#include <linux/kvm.h> +#include <linux/psp-sev.h> + +#ifdef CONFIG_NUMA +#include <numaif.h> +#endif + +#include "trace.h" +#include "cpu.h" +#include "sev.h" +#include "csv.h" + +bool csv_kvm_cpu_reset_inhibit; +uint32_t kvm_hygon_coco_ext; +uint32_t kvm_hygon_coco_ext_inuse; + +struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops = { + .save_setup = sev_save_setup, + .save_outgoing_page = NULL, + .load_incoming_page = csv3_load_incoming_page, + .is_gfn_in_unshared_region = NULL, + .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list, + .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list, + .queue_outgoing_page = csv3_queue_outgoing_page, + .save_queued_outgoing_pages = csv3_save_queued_outgoing_pages, + .queue_incoming_page = NULL, + .load_queued_incoming_pages = NULL, + .save_outgoing_cpu_state = csv3_save_outgoing_context, + .load_incoming_cpu_state = csv3_load_incoming_context, +}; + +#define CSV3_OUTGOING_PAGE_NUM \ + (CSV3_OUTGOING_PAGE_WINDOW_SIZE / TARGET_PAGE_SIZE) + +Csv3GuestState csv3_guest = { 0 }; + +int +csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops) +{ + int fw_error; + int ret; + struct kvm_csv3_init_data data = { 0 }; + +#ifdef CONFIG_NUMA + int mode; + unsigned long nodemask; + + /* Set flags as 0 to retrieve the default NUMA policy. */ + ret = get_mempolicy(&mode, &nodemask, sizeof(nodemask) * 8, NULL, 0); + if (ret == 0 && mode == MPOL_BIND) + data.nodemask = nodemask; +#endif + + if (!ops || !ops->sev_ioctl || !ops->fw_error_to_str) + return -1; + + csv3_guest.policy = policy; + if (csv3_enabled()) { + ret = ops->sev_ioctl(fd, KVM_CSV3_INIT, &data, &fw_error); + if (ret) { + csv3_guest.policy = 0; + error_report("%s: Failed to initialize ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, ops->fw_error_to_str(fw_error)); + return -1; + } + + kvm_csv3_allowed = true; + + csv3_guest.sev_fd = fd; + csv3_guest.state = state; + csv3_guest.sev_ioctl = ops->sev_ioctl; + csv3_guest.fw_error_to_str = ops->fw_error_to_str; + QTAILQ_INIT(&csv3_guest.dma_map_regions_list); + qemu_mutex_init(&csv3_guest.dma_map_regions_list_mutex); + csv3_guest.sev_send_start = ops->sev_send_start; + csv3_guest.sev_receive_start = ops->sev_receive_start; + } + return 0; +} + +bool +csv3_enabled(void) +{ + if (!is_hygon_cpu()) + return false; + + return sev_es_enabled() && (csv3_guest.policy & GUEST_POLICY_CSV3_BIT); +} + +static bool +csv3_check_state(SevState state) +{ + return *((SevState *)csv3_guest.state) == state; +} + +static int +csv3_ioctl(int cmd, void *data, int *error) +{ + if (csv3_guest.sev_ioctl) + return csv3_guest.sev_ioctl(csv3_guest.sev_fd, cmd, data, error); + else + return -1; +} + +static const char * +fw_error_to_str(int code) +{ + if (csv3_guest.fw_error_to_str) + return csv3_guest.fw_error_to_str(code); + else + return NULL; +} + +static int +csv3_launch_encrypt_data(uint64_t gpa, uint8_t *addr, uint64_t len) +{ + int ret, fw_error; + struct kvm_csv3_launch_encrypt_data update; + + if (!addr || !len) { + return 1; + } + + update.gpa = (__u64)gpa; + update.uaddr = (__u64)(unsigned long)addr; + update.len = len; + trace_kvm_csv3_launch_encrypt_data(gpa, addr, len); + ret = csv3_ioctl(KVM_CSV3_LAUNCH_ENCRYPT_DATA, &update, &fw_error); + if (ret) { +
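/* fw_error_to_str() decodes the raw firmware status code */ +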
error_report("%s: CSV3 LAUNCH_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + } + + return ret; +} + +int +csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp) +{ + int ret = 0; + + if (!csv3_enabled()) { + error_setg(errp, "%s: CSV3 is not enabled", __func__); + return -1; + } + + /* if CSV3 is in update state then load the data to secure memory */ + if (csv3_check_state(SEV_STATE_LAUNCH_UPDATE)) { + ret = csv3_launch_encrypt_data(gpa, ptr, len); + if (ret) + error_setg(errp, "%s: CSV3 fail to encrypt data", __func__); + } + + return ret; +} + +int +csv3_launch_encrypt_vmcb(void) +{ + int ret, fw_error; + + if (!csv3_enabled()) { + error_report("%s: CSV3 is not enabled", __func__); + return -1; + } + + ret = csv3_ioctl(KVM_CSV3_LAUNCH_ENCRYPT_VMCB, NULL, &fw_error); + if (ret) { + error_report("%s: CSV3 LAUNCH_ENCRYPT_VMCB ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + +err: + return ret; +} + +int csv3_shared_region_dma_map(uint64_t start, uint64_t end) +{ + MemoryRegionSection section; + AddressSpace *as; + QTAILQ_HEAD(, SharedRegionListener) *shared_region_listeners; + SharedRegionListener *shl; + MemoryListener *listener; + uint64_t size; + Csv3GuestState *s = &csv3_guest; + struct dma_map_region *region, *pos; + int ret = 0; + + if (!csv3_enabled()) + return 0; + + if (end <= start) + return 0; + + shared_region_listeners = shared_region_listeners_get(); + if (QTAILQ_EMPTY(shared_region_listeners)) + return 0; + + size = end - start; + + qemu_mutex_lock(&s->dma_map_regions_list_mutex); + QTAILQ_FOREACH(pos, &s->dma_map_regions_list, list) { + if (start >= (pos->start + pos->size)) { + continue; + } else if ((start + size) <= pos->start) { + break; + } else { + goto end; + } + } + QTAILQ_FOREACH(shl, shared_region_listeners, next) { + listener = shl->listener; + as = shl->as; + section = memory_region_find(as->root, start, size); + if (!section.mr) { + goto end; + } + + if (!memory_region_is_ram(section.mr)) { + memory_region_unref(section.mr); + goto end; + } + + if (listener->region_add) { + listener->region_add(listener, §ion); + } + memory_region_unref(section.mr); + } + + region = g_malloc0(sizeof(*region)); + if (!region) { + ret = -1; + goto end; + } + region->start = start; + region->size = size; + + if (pos) { + QTAILQ_INSERT_BEFORE(pos, region, list); + } else { + QTAILQ_INSERT_TAIL(&s->dma_map_regions_list, region, list); + } + +end: + qemu_mutex_unlock(&s->dma_map_regions_list_mutex); + return ret; +} + +void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages) +{ + struct kvm_csv3_handle_memory mem = { 0 }; + MemoryRegion *mr = NULL; + void *hva; + int ret; + + if (!csv3_enabled()) + return; + + if (!gpa || !num_pages) + return; + + mem.gpa = (__u64)gpa; + mem.num_pages = (__u32)num_pages; + mem.opcode = (__u32)KVM_CSV3_RELEASE_SHARED_MEMORY; + + /* unpin the pages */ + ret = csv3_ioctl(KVM_CSV3_HANDLE_MEMORY, &mem, NULL); + if (ret <= 0) { + if (ret < 0) + error_report("%s: CSV3 unpin failed ret %d", __func__, ret); + return; + } + + /* drop the pages */ + hva = gpa2hva(&mr, gpa, num_pages << TARGET_PAGE_BITS, NULL); + if (hva) { + ret = madvise(hva, num_pages << TARGET_PAGE_BITS, MADV_DONTNEED); + if (ret) + error_report("%s: madvise failed %d", __func__, ret); + } +} + +void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end) +{ + MemoryRegionSection section; + AddressSpace *as; + QTAILQ_HEAD(, SharedRegionListener) 
*shared_region_listeners; + SharedRegionListener *shl; + MemoryListener *listener; + uint64_t size; + Csv3GuestState *s = &csv3_guest; + struct dma_map_region *pos, *next_pos; + + if (!csv3_enabled()) + return; + + if (end <= start) + return; + + shared_region_listeners = shared_region_listeners_get(); + if (QTAILQ_EMPTY(shared_region_listeners)) + return; + + size = end - start; + + qemu_mutex_lock(&s->dma_map_regions_list_mutex); + QTAILQ_FOREACH_SAFE(pos, &s->dma_map_regions_list, list, next_pos) { + uint64_t l, r; + uint64_t curr_end = pos->start + pos->size; + + l = MAX(start, pos->start); + r = MIN(start + size, pos->start + pos->size); + if (l < r) { + if ((start <= pos->start) && (start + size >= pos->start + pos->size)) { + QTAILQ_FOREACH(shl, shared_region_listeners, next) { + listener = shl->listener; + as = shl->as; + section = memory_region_find(as->root, pos->start, pos->size); + if (!section.mr) { + goto end; + } + if (listener->region_del) { + listener->region_del(listener, §ion); + } + memory_region_unref(section.mr); + } + + QTAILQ_REMOVE(&s->dma_map_regions_list, pos, list); + g_free(pos); + } + break; + } + if ((start + size) <= curr_end) { + break; + } + } +end: + qemu_mutex_unlock(&s->dma_map_regions_list_mutex); + return; +} + +static inline hwaddr csv3_hva_to_gfn(uint8_t *ptr) +{ + ram_addr_t offset = RAM_ADDR_INVALID; + + kvm_physical_memory_addr_from_host(kvm_state, ptr, &offset); + + return offset >> TARGET_PAGE_BITS; +} + +static int +csv3_send_start(QEMUFile *f, uint64_t *bytes_sent) +{ + if (csv3_guest.sev_send_start) + return csv3_guest.sev_send_start(f, bytes_sent); + else + return -1; +} + +static int +csv3_send_get_packet_len(int *fw_err) +{ + int ret; + struct kvm_csv3_send_encrypt_data update = {0}; + + update.hdr_len = 0; + update.trans_len = 0; + ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_DATA, &update, fw_err); + if (*fw_err != SEV_RET_INVALID_LEN) { + error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + ret = 0; + goto err; + } + + if (update.hdr_len <= INT_MAX) + ret = update.hdr_len; + else + ret = 0; + +err: + return ret; +} + +static int +csv3_send_encrypt_data(Csv3GuestState *s, QEMUFile *f, + uint8_t *ptr, uint32_t size, uint64_t *bytes_sent) +{ + int ret, fw_error = 0; + guchar *trans; + uint32_t guest_addr_entry_num; + uint32_t i; + struct kvm_csv3_send_encrypt_data update = { }; + + /* + * If this is first call then query the packet header bytes and allocate + * the packet buffer. 
+ */ + if (!s->send_packet_hdr) { + s->send_packet_hdr_len = csv3_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + s->send_packet_hdr = g_new(gchar, s->send_packet_hdr_len); + } + + if (!s->guest_addr_len || !s->guest_addr_data) { + error_report("%s: invalid host address or size", __func__); + return 1; + } else { + guest_addr_entry_num = s->guest_addr_len / sizeof(struct guest_addr_entry); + } + + /* allocate transport buffer */ + trans = g_new(guchar, guest_addr_entry_num * TARGET_PAGE_SIZE); + + update.hdr_uaddr = (uintptr_t)s->send_packet_hdr; + update.hdr_len = s->send_packet_hdr_len; + update.guest_addr_data = (uintptr_t)s->guest_addr_data; + update.guest_addr_len = s->guest_addr_len; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = guest_addr_entry_num * TARGET_PAGE_SIZE; + + trace_kvm_csv3_send_encrypt_data(trans, update.trans_len); + + ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_DATA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + for (i = 0; i < guest_addr_entry_num; i++) { + if (s->guest_addr_data[i].share) + memcpy(trans + i * TARGET_PAGE_SIZE, (guchar *)s->guest_hva_data[i].hva, + TARGET_PAGE_SIZE); + } + + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent += 4 + update.hdr_len; + + qemu_put_be32(f, update.guest_addr_len); + qemu_put_buffer(f, (uint8_t *)update.guest_addr_data, update.guest_addr_len); + *bytes_sent += 4 + update.guest_addr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += (4 + update.trans_len); + +err: + s->guest_addr_len = 0; + g_free(trans); + return ret; +} + +int +csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr) +{ + Csv3GuestState *s = &csv3_guest; + uint32_t i = 0; + + if (!s->guest_addr_data) { + s->guest_hva_data = g_new0(struct guest_hva_entry, CSV3_OUTGOING_PAGE_NUM); + s->guest_addr_data = g_new0(struct guest_addr_entry, CSV3_OUTGOING_PAGE_NUM); + s->guest_addr_len = 0; + } + + if (s->guest_addr_len >= sizeof(struct guest_addr_entry) * CSV3_OUTGOING_PAGE_NUM) { + error_report("Failed to queue outgoing page"); + return 1; + } + + i = s->guest_addr_len / sizeof(struct guest_addr_entry); + s->guest_hva_data[i].hva = (uintptr_t)ptr; + s->guest_addr_data[i].share = 0; + s->guest_addr_data[i].reserved = 0; + s->guest_addr_data[i].gfn = csv3_hva_to_gfn(ptr); + s->guest_addr_len += sizeof(struct guest_addr_entry); + + return 0; +} + +int +csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent) +{ + Csv3GuestState *s = &csv3_guest; + + /* + * If this is a first buffer then create outgoing encryption context + * and write our PDH, policy and session data. 
+ */ + if (!csv3_check_state(SEV_STATE_SEND_UPDATE) && + csv3_send_start(f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; + } + + return csv3_send_encrypt_data(s, f, NULL, 0, bytes_sent); +} + +static int +csv3_receive_start(QEMUFile *f) +{ + if (csv3_guest.sev_receive_start) + return csv3_guest.sev_receive_start(f); + else + return -1; +} + +static int csv3_receive_encrypt_data(QEMUFile *f, uint8_t *ptr) +{ + int ret = 1, fw_error = 0; + uint32_t i, guest_addr_entry_num; + gchar *hdr = NULL, *trans = NULL; + struct guest_addr_entry *guest_addr_data; + struct kvm_csv3_receive_encrypt_data update = {}; + void *hva = NULL; + MemoryRegion *mr = NULL; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get guest addr data */ + update.guest_addr_len = qemu_get_be32(f); + + guest_addr_data = (struct guest_addr_entry *)g_new(gchar, update.guest_addr_len); + qemu_get_buffer(f, (uint8_t *)guest_addr_data, update.guest_addr_len); + update.guest_addr_data = (uintptr_t)guest_addr_data; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + /* update share memory. */ + guest_addr_entry_num = update.guest_addr_len / sizeof(struct guest_addr_entry); + for (i = 0; i < guest_addr_entry_num; i++) { + if (guest_addr_data[i].share) { + hva = gpa2hva(&mr, + ((uint64_t)guest_addr_data[i].gfn << TARGET_PAGE_BITS), + TARGET_PAGE_SIZE, + NULL); + if (hva) + memcpy(hva, trans + i * TARGET_PAGE_SIZE, TARGET_PAGE_SIZE); + } + } + + trace_kvm_csv3_receive_encrypt_data(trans, update.trans_len, hdr, update.hdr_len); + + ret = csv3_ioctl(KVM_CSV3_RECEIVE_ENCRYPT_DATA, &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_ENCRYPT_DATA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + +err: + g_free(trans); + g_free(guest_addr_data); + g_free(hdr); + return ret; +} + +int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr) +{ + /* + * If this is first buffer and SEV is not in recieiving state then + * use RECEIVE_START command to create a encryption context. 
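+ * + * The incoming stream is framed as be32 length prefixes, each followed + * by its payload (packet header, guest address table, transport buffer), + * mirroring the qemu_put_be32()/qemu_put_buffer() sequence in + * csv3_send_encrypt_data() on the source side.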
+     */
+    if (!csv3_check_state(SEV_STATE_RECEIVE_UPDATE) &&
+        csv3_receive_start(f)) {
+        return 1;
+    }
+
+    return csv3_receive_encrypt_data(f, ptr);
+}
+
+static int
+csv3_send_get_context_len(int *fw_err, int *context_len, int *hdr_len)
+{
+    int ret = 0;
+    struct kvm_csv3_send_encrypt_context update = { 0 };
+
+    ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_CONTEXT, &update, fw_err);
+    if (*fw_err != SEV_RET_INVALID_LEN) {
+        error_report("%s: failed to get context length ret=%d fw_error=%d '%s'",
+                     __func__, ret, *fw_err, fw_error_to_str(*fw_err));
+        ret = -1;
+        goto err;
+    }
+
+    if (update.trans_len <= INT_MAX && update.hdr_len <= INT_MAX) {
+        *context_len = update.trans_len;
+        *hdr_len = update.hdr_len;
+    }
+    ret = 0;
+err:
+    return ret;
+}
+
+static int
+csv3_send_encrypt_context(Csv3GuestState *s, QEMUFile *f, uint64_t *bytes_sent)
+{
+    int ret, fw_error = 0;
+    int context_len = 0;
+    int hdr_len = 0;
+    guchar *trans;
+    guchar *hdr;
+    struct kvm_csv3_send_encrypt_context update = { };
+
+    ret = csv3_send_get_context_len(&fw_error, &context_len, &hdr_len);
+    if (context_len < 1 || hdr_len < 1) {
+        error_report("%s: failed to get context length fw_error=%d '%s'",
+                     __func__, fw_error, fw_error_to_str(fw_error));
+        return 1;
+    }
+
+    /* allocate transport buffer */
+    trans = g_new(guchar, context_len);
+    hdr = g_new(guchar, hdr_len);
+
+    update.hdr_uaddr = (uintptr_t)hdr;
+    update.hdr_len = hdr_len;
+    update.trans_uaddr = (uintptr_t)trans;
+    update.trans_len = context_len;
+
+    trace_kvm_csv3_send_encrypt_context(trans, update.trans_len);
+
+    ret = csv3_ioctl(KVM_CSV3_SEND_ENCRYPT_CONTEXT, &update, &fw_error);
+    if (ret) {
+        error_report("%s: SEND_ENCRYPT_CONTEXT ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+        goto err;
+    }
+
+    qemu_put_be32(f, update.hdr_len);
+    qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len);
+    *bytes_sent += 4 + update.hdr_len;
+
+    qemu_put_be32(f, update.trans_len);
+    qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len);
+    *bytes_sent += 4 + update.trans_len;
+
+err:
+    g_free(trans);
+    g_free(hdr);
+    return ret;
+}
+
+static int
+csv3_receive_encrypt_context(Csv3GuestState *s, QEMUFile *f)
+{
+    int ret = 1, fw_error = 0;
+    gchar *hdr = NULL, *trans = NULL;
+    struct kvm_csv3_receive_encrypt_context update = {};
+
+    /* get packet header */
+    update.hdr_len = qemu_get_be32(f);
+
+    hdr = g_new(gchar, update.hdr_len);
+    qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len);
+    update.hdr_uaddr = (uintptr_t)hdr;
+
+    /* get transport buffer */
+    update.trans_len = qemu_get_be32(f);
+
+    trans = g_new(gchar, update.trans_len);
+    update.trans_uaddr = (uintptr_t)trans;
+    qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len);
+
+    trace_kvm_csv3_receive_encrypt_context(trans, update.trans_len, hdr, update.hdr_len);
+
+    ret = csv3_ioctl(KVM_CSV3_RECEIVE_ENCRYPT_CONTEXT, &update, &fw_error);
+    if (ret) {
+        error_report("Error RECEIVE_ENCRYPT_CONTEXT ret=%d fw_error=%d '%s'",
+                     ret, fw_error, fw_error_to_str(fw_error));
+        goto err;
+    }
+
+err:
+    g_free(trans);
+    g_free(hdr);
+    return ret;
+}
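csv3_send_get_context_len() relies on the usual SEV firmware calling convention: issue the command with zero-length buffers, let it fail with SEV_RET_INVALID_LEN, and read the required sizes back out of the request structure. A self-contained sketch of that probe-allocate-retry pattern; fw_cmd() and its request struct are hypothetical stand-ins for the ioctl:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RET_INVALID_LEN 1        /* stand-in for SEV_RET_INVALID_LEN */

struct req { void *buf; uint32_t len; };

/* Mock firmware command: reports the needed size when len is too small. */
static int fw_cmd(struct req *r)
{
    if (r->len < 16) {
        r->len = 16;
        return RET_INVALID_LEN;
    }
    return 0;                    /* pretend the command succeeded */
}

static int run_with_sized_buffer(void)
{
    struct req r = { 0 };        /* first call: len == 0 on purpose */
    int err = fw_cmd(&r);

    if (err != RET_INVALID_LEN) {
        return -1;               /* anything else is a real failure */
    }

    r.buf = malloc(r.len);       /* firmware told us how much it needs */
    if (!r.buf) {
        return -1;
    }

    err = fw_cmd(&r);            /* second call does the actual work */
    free(r.buf);
    return err ? -1 : 0;
}

int main(void)
{
    printf("result: %d\n", run_with_sized_buffer());
    return 0;
}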
+
+int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent)
+{
+    Csv3GuestState *s = &csv3_guest;
+
+    /* send csv3 context. */
+    return csv3_send_encrypt_context(s, f, bytes_sent);
+}
+
+int csv3_load_incoming_context(QEMUFile *f)
+{
+    Csv3GuestState *s = &csv3_guest;
+
+    /* receive csv3 context. */
+    return csv3_receive_encrypt_context(s, f);
+}
+
+int csv3_set_guest_private_memory(Error **errp)
+{
+    int fw_error;
+    int ret = 0;
+
+    if (!csv3_enabled()) {
+        error_setg(errp, "%s: CSV3 is not enabled", __func__);
+        return -1;
+    }
+
+    /* if CSV3 is in update state then load the data to secure memory */
+    if (csv3_check_state(SEV_STATE_LAUNCH_UPDATE)) {
+        trace_kvm_csv3_set_guest_private_memory();
+        ret = csv3_ioctl(KVM_CSV3_SET_GUEST_PRIVATE_MEMORY, NULL, &fw_error);
+        if (ret)
+            error_setg(errp, "%s: CSV3 failed to set private memory, ret=%d"
+                       " fw_error=%d '%s'",
+                       __func__, ret, fw_error, fw_error_to_str(fw_error));
+    }
+
+    return ret;
+}
diff --git a/target/i386/csv.h b/target/i386/csv.h
new file mode 100644
index 0000000000000000000000000000000000000000..70f9933d3b29d83b5e88bf4b17b26e8df4ca0986
--- /dev/null
+++ b/target/i386/csv.h
@@ -0,0 +1,136 @@
+/*
+ * QEMU CSV support
+ *
+ * Copyright: Hygon Info Technologies Ltd. 2022
+ *
+ * Author:
+ *      Jiang Xin
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef I386_CSV_H
+#define I386_CSV_H
+
+#include "qapi/qapi-commands-misc-target.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "sev.h"
+
+#define GUEST_POLICY_CSV3_BIT   (1 << 6)
+#define GUEST_POLICY_REUSE_ASID (1 << 7)
+
+#ifdef CONFIG_CSV
+
+#include "cpu.h"
+
+#define CPUID_VENDOR_HYGON_EBX  0x6f677948 /* "Hygo" */
+#define CPUID_VENDOR_HYGON_ECX  0x656e6975 /* "uine" */
+#define CPUID_VENDOR_HYGON_EDX  0x6e65476e /* "nGen" */
+
+static bool __attribute__((unused)) is_hygon_cpu(void)
+{
+    uint32_t ebx = 0;
+    uint32_t ecx = 0;
+    uint32_t edx = 0;
+
+    host_cpuid(0, 0, NULL, &ebx, &ecx, &edx);
+
+    if (ebx == CPUID_VENDOR_HYGON_EBX &&
+        ecx == CPUID_VENDOR_HYGON_ECX &&
+        edx == CPUID_VENDOR_HYGON_EDX)
+        return true;
+    else
+        return false;
+}
+
+bool csv3_enabled(void);
+
+#else
+
+#define is_hygon_cpu() (false)
+#define csv3_enabled() (false)
+
+#endif
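The is_hygon_cpu() check above compares the three CPUID leaf-0 vendor registers against the little-endian encoding of "HygonGenuine" (EBX/EDX/ECX carry "Hygo"/"nGen"/"uine"). A standalone sketch of the same test, using GCC's cpuid.h instead of QEMU's host_cpuid() helper:

#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

/* Vendor string "HygonGenuine" as it appears in EBX/EDX/ECX of CPUID.0. */
static bool host_is_hygon(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
        return false;
    }
    return ebx == 0x6f677948 &&   /* "Hygo" */
           ecx == 0x656e6975 &&   /* "uine" */
           edx == 0x6e65476e;     /* "nGen" */
}

int main(void)
{
    printf("Hygon host: %s\n", host_is_hygon() ? "yes" : "no");
    return 0;
}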
+
+#define CSV_OUTGOING_PAGE_WINDOW_SIZE (4094 * TARGET_PAGE_SIZE)
+
+extern bool csv_kvm_cpu_reset_inhibit;
+extern uint32_t kvm_hygon_coco_ext;
+extern uint32_t kvm_hygon_coco_ext_inuse;
+
+typedef struct CsvBatchCmdList CsvBatchCmdList;
+typedef void (*CsvDestroyCmdNodeFn) (void *data);
+
+struct CsvBatchCmdList {
+    struct kvm_csv_batch_list_node *head;
+    struct kvm_csv_batch_list_node *tail;
+    CsvDestroyCmdNodeFn destroy_fn;
+};
+
+int csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr);
+int csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent);
+int csv_queue_incoming_page(QEMUFile *f, uint8_t *ptr);
+int csv_load_queued_incoming_pages(QEMUFile *f);
+int csv_save_outgoing_cpu_state(QEMUFile *f, uint64_t *bytes_sent);
+int csv_load_incoming_cpu_state(QEMUFile *f);
+
+/* CSV3 */
+struct dma_map_region {
+    uint64_t start, size;
+    QTAILQ_ENTRY(dma_map_region) list;
+};
+
+#define CSV3_OUTGOING_PAGE_WINDOW_SIZE (512 * TARGET_PAGE_SIZE)
+
+struct guest_addr_entry {
+    uint64_t share:    1;
+    uint64_t reserved: 11;
+    uint64_t gfn:      52;
+};
+
+struct guest_hva_entry {
+    uint64_t hva;
+};
+
+struct Csv3GuestState {
+    uint32_t policy;
+    int sev_fd;
+    void *state;
+    int (*sev_ioctl)(int fd, int cmd, void *data, int *error);
+    const char *(*fw_error_to_str)(int code);
+    QTAILQ_HEAD(, dma_map_region) dma_map_regions_list;
+    QemuMutex dma_map_regions_list_mutex;
+    gchar *send_packet_hdr;
+    size_t send_packet_hdr_len;
+    struct guest_hva_entry *guest_hva_data;
+    struct guest_addr_entry *guest_addr_data;
+    size_t guest_addr_len;
+
+    int (*sev_send_start)(QEMUFile *f, uint64_t *bytes_sent);
+    int (*sev_receive_start)(QEMUFile *f);
+};
+
+typedef struct Csv3GuestState Csv3GuestState;
+
+extern struct Csv3GuestState csv3_guest;
+extern struct ConfidentialGuestMemoryEncryptionOps csv3_memory_encryption_ops;
+extern int csv3_init(uint32_t policy, int fd, void *state, struct sev_ops *ops);
+extern int csv3_launch_encrypt_vmcb(void);
+
+int csv3_load_data(uint64_t gpa, uint8_t *ptr, uint64_t len, Error **errp);
+
+int csv3_shared_region_dma_map(uint64_t start, uint64_t end);
+void csv3_shared_region_dma_unmap(uint64_t start, uint64_t end);
+void csv3_shared_region_release(uint64_t gpa, uint32_t num_pages);
+int csv3_load_incoming_page(QEMUFile *f, uint8_t *ptr);
+int csv3_load_incoming_context(QEMUFile *f);
+int csv3_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr);
+int csv3_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent);
+int csv3_save_outgoing_context(QEMUFile *f, uint64_t *bytes_sent);
+
+int csv3_set_guest_private_memory(Error **errp);
+
+#endif
diff --git a/target/i386/helper.c b/target/i386/helper.c
index 2070dd0dda1f4c8e740698779cd79a038c42ffe1..1da7a7d315c3e61cdfc4d97526834db3eb566661 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -219,6 +219,10 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
         new_cr4 &= ~CR4_PKS_MASK;
     }
 
+    if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) {
+        new_cr4 &= ~CR4_LAM_SUP_MASK;
+    }
+
     env->cr[4] = new_cr4;
     env->hflags = hflags;
diff --git a/target/i386/host-cpu.c b/target/i386/host-cpu.c
index 92ecb7254b8363de8c82f4800573ba990381e2ce..07738bf85740e8785a207cbd32cb6744eb78a699 100644
--- a/target/i386/host-cpu.c
+++ b/target/i386/host-cpu.c
@@ -13,6 +13,7 @@
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "sysemu/sysemu.h"
+#include "hw/boards.h"
 
 /* Note: Only safe for use on x86(-64) hosts */
 static uint32_t host_cpu_phys_bits(void)
@@ -57,14 +58,14 @@ static uint32_t host_cpu_adjust_phys_bits(X86CPU *cpu)
     uint32_t phys_bits = cpu->phys_bits;
     static bool warned;
 
-    /*
-     * Print a warning if the user set it to a value that's not the
-     * host value.
-     */
-    if (phys_bits != host_phys_bits && phys_bits != 0 &&
+    /* adjust x86 cpu phys_bits according to ram_size. */
+    x86_cpu_adjuest_by_ram_size(current_machine->ram_size, cpu);
+
+    /* Print a warning if the host value is less than the user-set value. */
+    if (phys_bits > host_phys_bits && phys_bits != 0 &&
         !warned) {
         warn_report("Host physical bits (%u)"
-                    " does not match phys-bits property (%u)",
+                    " is less than phys-bits property (%u)",
                     host_phys_bits, phys_bits);
         warned = true;
     }
diff --git a/target/i386/hvf/x86_task.c b/target/i386/hvf/x86_task.c
index f09bfbdda5bebbdfc21d5a6a0a8c6d46029d29fd..cdea2ea69d97486dd4b282bc2469c13f66906444 100644
--- a/target/i386/hvf/x86_task.c
+++ b/target/i386/hvf/x86_task.c
@@ -122,7 +122,6 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea
     load_regs(cpu);
 
     struct x86_segment_descriptor curr_tss_desc, next_tss_desc;
-    int ret;
     x68_segment_selector old_tss_sel = vmx_read_segment_selector(cpu, R_TR);
     uint64_t old_tss_base = vmx_read_segment_base(cpu, R_TR);
     uint32_t desc_limit;
@@ -138,7 +137,7 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea
     if (reason == TSR_IDT_GATE && gate_valid) {
         int dpl;
 
-        ret = x86_read_call_gate(cpu, &task_gate_desc, gate);
+        x86_read_call_gate(cpu, &task_gate_desc, gate);
 
         dpl = task_gate_desc.dpl;
         x68_segment_selector cs = vmx_read_segment_selector(cpu, R_CS);
@@ -167,11 +166,12 @@ void vmx_handle_task_switch(CPUState *cpu, x68_segment_selector tss_sel, int rea
         x86_write_segment_descriptor(cpu, &next_tss_desc, tss_sel);
     }
 
-    if (next_tss_desc.type & 8)
-        ret = task_switch_32(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc);
-    else
+    if (next_tss_desc.type & 8) {
+        task_switch_32(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc);
+    } else {
         //ret = task_switch_16(cpu, tss_sel, old_tss_sel, old_tss_base, &next_tss_desc);
         VM_PANIC("task_switch_16");
+    }
 
     macvm_set_cr0(cpu->accel->fd, rvmcs(cpu->accel->fd, VMCS_GUEST_CR0) | CR0_TS_MASK);
diff --git a/target/i386/kvm/csv-stub.c b/target/i386/kvm/csv-stub.c
new file mode 100644
index 0000000000000000000000000000000000000000..8662d33206076c6686fb0e3bb65e6b4eba60ad61
--- /dev/null
+++ b/target/i386/kvm/csv-stub.c
@@ -0,0 +1,19 @@
+/*
+ * QEMU CSV stub
+ *
+ * Copyright Hygon Info Technologies Ltd. 2024
+ *
+ * Authors:
+ *      Han Liyang
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "csv.h"
+
+bool csv_kvm_cpu_reset_inhibit;
+uint32_t kvm_hygon_coco_ext;
+uint32_t kvm_hygon_coco_ext_inuse;
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index 9c791b7b0520089c15404eb1a2826c932b80d1ee..a3bc8d8f832c1358270846b473769d872a14463b 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -18,10 +18,32 @@
 #include "kvm_i386.h"
 #include "hw/core/accel-cpu.h"
 
+static void kvm_set_guest_phys_bits(CPUState *cs)
+{
+    X86CPU *cpu = X86_CPU(cs);
+    uint32_t eax, guest_phys_bits;
+
+    eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x80000008, 0, R_EAX);
+    guest_phys_bits = (eax >> 16) & 0xff;
+    if (!guest_phys_bits) {
+        return;
+    }
+    cpu->guest_phys_bits = guest_phys_bits;
+    if (cpu->guest_phys_bits > cpu->phys_bits) {
+        cpu->guest_phys_bits = cpu->phys_bits;
+    }
+
+    if (cpu->host_phys_bits && cpu->host_phys_bits_limit &&
+        cpu->guest_phys_bits > cpu->host_phys_bits_limit) {
+        cpu->guest_phys_bits = cpu->host_phys_bits_limit;
+    }
+}
+
 static bool kvm_cpu_realizefn(CPUState *cs, Error **errp)
 {
     X86CPU *cpu = X86_CPU(cs);
     CPUX86State *env = &cpu->env;
+    bool ret;
 
     /*
      * The realize order is important, since x86_cpu_realize() checks if
@@ -32,13 +54,15 @@ static bool kvm_cpu_realizefn(CPUState *cs, Error **errp)
      *
      * realize order:
      *
-     * x86_cpu_realize():
-     *  -> x86_cpu_expand_features()
-     *  -> cpu_exec_realizefn():
-     *     -> accel_cpu_common_realize()
-     *        kvm_cpu_realizefn() -> host_cpu_realizefn()
-     *  -> cpu_common_realizefn()
-     *  -> check/update ucode_rev, phys_bits, mwait
+     * x86_cpu_realizefn():
+     *  x86_cpu_expand_features()
+     *  cpu_exec_realizefn():
+     *   accel_cpu_common_realize()
+     *    kvm_cpu_realizefn()
+     *     host_cpu_realizefn()
+     *     kvm_set_guest_phys_bits()
+     *    check/update ucode_rev, phys_bits, guest_phys_bits, mwait
+     *  cpu_common_realizefn() (via xcc->parent_realize)
      */
     if (cpu->max_features) {
         if (enable_cpu_pm && kvm_has_waitpkg()) {
@@ -50,7 +74,17 @@ static bool kvm_cpu_realizefn(CPUState *cs, Error **errp)
             MSR_IA32_UCODE_REV);
         }
     }
-    return host_cpu_realizefn(cs, errp);
+    ret = host_cpu_realizefn(cs, errp);
+    if (!ret) {
+        return ret;
+    }
+
+    if ((env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) &&
+        cpu->guest_phys_bits == -1) {
+        kvm_set_guest_phys_bits(cs);
+    }
+
+    return true;
 }
 
 static bool lmce_supported(void)
@@ -103,7 +137,7 @@ static void kvm_cpu_xsave_init(void)
         if (!esa->size) {
             continue;
         }
-        if ((x86_cpu_get_supported_feature_word(esa->feature, false) & esa->bits)
+        if ((x86_cpu_get_supported_feature_word(NULL, esa->feature) & esa->bits)
             != esa->bits) {
            continue;
         }
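kvm_set_guest_phys_bits() reads CPUID leaf 0x80000008: EAX[7:0] is the host physical address width, EAX[15:8] the linear address width, and EAX[23:16] the guest physical address width (zero meaning "same as host"). A hypothetical standalone decoder for those fields, using GCC's cpuid.h rather than KVM's filtered view:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
        return 1;                       /* leaf not available */
    }
    printf("host phys bits:  %u\n", eax & 0xff);
    printf("linear bits:     %u\n", (eax >> 8) & 0xff);
    printf("guest phys bits: %u\n", (eax >> 16) & 0xff);  /* 0 = same as host */
    return 0;
}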
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 4ce80555b45c14fe8f683c1ee6c5bde5db216a9b..3a88e65635698ec3043a90b05cd984a2e93e7a58 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -32,6 +32,7 @@
 #include "sysemu/runstate.h"
 #include "kvm_i386.h"
 #include "sev.h"
+#include "csv.h"
 #include "xen-emu.h"
 #include "hyperv.h"
 #include "hyperv-proto.h"
@@ -148,6 +149,7 @@ static int has_xcrs;
 static int has_sregs2;
 static int has_exception_payload;
 static int has_triple_fault_event;
+static int has_map_gpa_range;
 
 static bool has_msr_mcg_ext_ctl;
 
@@ -1894,10 +1896,12 @@ int kvm_arch_init_vcpu(CPUState *cs)
             int times;
 
             c->function = i;
-            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
-                       KVM_CPUID_FLAG_STATE_READ_NEXT;
             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
             times = c->eax & 0xff;
+            if (times > 1) {
+                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
+                           KVM_CPUID_FLAG_STATE_READ_NEXT;
+            }
 
             for (j = 1; j < times; ++j) {
                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
@@ -1914,6 +1918,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
             }
         case 0x1f:
             if (env->nr_dies < 2) {
+                cpuid_i--;
                 break;
             }
             /* fallthrough */
@@ -1921,14 +1926,6 @@ int kvm_arch_init_vcpu(CPUState *cs)
         case 0xb:
         case 0xd:
             for (j = 0; ; j++) {
-                if (i == 0xd && j == 64) {
-                    break;
-                }
-
-                if (i == 0x1f && j == 64) {
-                    break;
-                }
-
                 c->function = i;
                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                 c->index = j;
@@ -1944,7 +1941,12 @@ int kvm_arch_init_vcpu(CPUState *cs)
                     break;
                 }
                 if (i == 0xd && c->eax == 0) {
-                    continue;
+                    if (j < 63) {
+                        continue;
+                    } else {
+                        cpuid_i--;
+                        break;
+                    }
                 }
                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                     fprintf(stderr, "cpuid_data is full, no space for "
@@ -1954,7 +1956,6 @@ int kvm_arch_init_vcpu(CPUState *cs)
                 c = &cpuid_data.entries[cpuid_i++];
             }
             break;
-        case 0x7:
         case 0x12:
             for (j = 0; ; j++) {
                 c->function = i;
@@ -1974,6 +1975,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
                 c = &cpuid_data.entries[cpuid_i++];
             }
             break;
+        case 0x7:
         case 0x14:
         case 0x1d:
         case 0x1e: {
@@ -2190,6 +2192,17 @@ int kvm_arch_init_vcpu(CPUState *cs)
         c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
     }
 
+    if (sev_enabled()) {
+        c = cpuid_find_entry(&cpuid_data.cpuid,
+                             KVM_CPUID_FEATURES | kvm_base, 0);
+        if (c) {
+            c->eax |= (1 << KVM_FEATURE_MIGRATION_CONTROL);
+            if (has_map_gpa_range) {
+                c->eax |= (1 << KVM_FEATURE_HC_MAP_GPA_RANGE);
+            }
+        }
+    }
+
     cpuid_data.cpuid.nent = cpuid_i;
     cpuid_data.cpuid.padding = 0;
@@ -2257,6 +2270,11 @@ void kvm_arch_reset_vcpu(X86CPU *cpu)
         env->mp_state = KVM_MP_STATE_RUNNABLE;
     }
 
+    if (cpu_is_bsp(cpu) &&
+        sev_enabled() && has_map_gpa_range) {
+        sev_remove_shared_regions_list(0, -1);
+    }
+
     /* enabled by default */
     env->poll_control_msr = 1;
 
@@ -2475,6 +2493,32 @@ static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
     return true;
 }
 
+/*
+ * Currently this exit is only used by SEV guests for
+ * MSR_KVM_MIGRATION_CONTROL to indicate if the guest
+ * is ready for migration.
+ */
+static uint64_t msr_kvm_migration_control;
+
+static bool kvm_rdmsr_kvm_migration_control(X86CPU *cpu, uint32_t msr,
+                                            uint64_t *val)
+{
+    *val = msr_kvm_migration_control;
+
+    return true;
+}
+
+static bool kvm_wrmsr_kvm_migration_control(X86CPU *cpu, uint32_t msr,
+                                            uint64_t val)
+{
+    msr_kvm_migration_control = val;
+
+    if (val == KVM_MIGRATION_READY)
+        sev_del_migrate_blocker();
+
+    return true;
+}
+
 static Notifier smram_machine_done;
 static KVMMemoryListener smram_listener;
 static AddressSpace smram_address_space;
@@ -2583,6 +2627,34 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
 #endif
     }
 
+    has_map_gpa_range = kvm_check_extension(s, KVM_CAP_EXIT_HYPERCALL);
+    if (has_map_gpa_range) {
+        ret = kvm_vm_enable_cap(s, KVM_CAP_EXIT_HYPERCALL, 0,
+                                KVM_EXIT_HYPERCALL_VALID_MASK);
+        if (ret < 0) {
+            error_report("kvm: Failed to enable MAP_GPA_RANGE cap: %s",
+                         strerror(-ret));
+            return ret;
+        }
+    }
+
+    if (is_hygon_cpu()) {
+        /* check and enable Hygon coco extensions */
+        kvm_hygon_coco_ext = (uint32_t)kvm_vm_check_extension(s,
+                                                KVM_CAP_HYGON_COCO_EXT);
+        if (kvm_hygon_coco_ext) {
+            ret = kvm_vm_enable_cap(s, KVM_CAP_HYGON_COCO_EXT, 0,
+                                    (uint64_t)kvm_hygon_coco_ext);
+            if (ret == -EINVAL) {
+                error_report("kvm: Failed to enable KVM_CAP_HYGON_COCO_EXT cap: %s",
+                             strerror(-ret));
+                kvm_hygon_coco_ext_inuse = 0;
+            } else {
+                kvm_hygon_coco_ext_inuse = (uint32_t)ret;
+            }
+        }
+    }
+
     ret = kvm_get_supported_msrs(s);
     if (ret < 0) {
         return ret;
@@ -2711,6 +2783,15 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
                          strerror(-ret));
             exit(1);
         }
+
+        r = kvm_filter_msr(s, MSR_KVM_MIGRATION_CONTROL,
+                           kvm_rdmsr_kvm_migration_control,
+                           kvm_wrmsr_kvm_migration_control);
+        if (!r) {
+            error_report("Could not install MSR_KVM_MIGRATION_CONTROL handler: %s",
+                         strerror(-ret));
+            exit(1);
+        }
     }
 
     return 0;
@@ -3172,7 +3253,14 @@ static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0, CR4_VMXE_MASK);
 
-    if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
+    if (f[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
+        /* FRED injected-event data (0x2052). */
+        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x52);
+    } else if (f[FEAT_VMX_EXIT_CTLS] &
+               VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS) {
+        /* Secondary VM-exit controls (0x2044). */
+        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x44);
+    } else if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
         /* TSC multiplier (0x2032). */
         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
     } else {
@@ -3309,6 +3397,17 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
         kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
         kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
+        if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, env->fred_rsp0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, env->fred_rsp1);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, env->fred_rsp2);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, env->fred_rsp3);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, env->fred_stklvls);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, env->fred_ssp1);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, env->fred_ssp2);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, env->fred_ssp3);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, env->fred_config);
+        }
     }
 #endif
@@ -3561,6 +3660,10 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         }
     }
 
+    if (sev_kvm_has_msr_ghcb) {
+        kvm_msr_entry_add(cpu, MSR_AMD64_SEV_ES_GHCB, env->ghcb_gpa);
+    }
+
     return kvm_buf_set_msrs(cpu);
 }
 
@@ -3781,6 +3884,17 @@ static int kvm_get_msrs(X86CPU *cpu)
         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
         kvm_msr_entry_add(cpu, MSR_FMASK, 0);
         kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
+        if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, 0);
+            kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, 0);
+        }
     }
 #endif
     kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
@@ -3935,6 +4049,10 @@ static int kvm_get_msrs(X86CPU *cpu)
         }
     }
 
+    if (sev_kvm_has_msr_ghcb) {
+        kvm_msr_entry_add(cpu, MSR_AMD64_SEV_ES_GHCB, 0);
+    }
+
     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
     if (ret < 0) {
         return ret;
@@ -4002,6 +4120,33 @@ static int kvm_get_msrs(X86CPU *cpu)
         case MSR_LSTAR:
             env->lstar = msrs[i].data;
             break;
+        case MSR_IA32_FRED_RSP0:
+            env->fred_rsp0 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_RSP1:
+            env->fred_rsp1 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_RSP2:
+            env->fred_rsp2 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_RSP3:
+            env->fred_rsp3 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_STKLVLS:
+            env->fred_stklvls = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_SSP1:
+            env->fred_ssp1 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_SSP2:
+            env->fred_ssp2 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_SSP3:
+            env->fred_ssp3 = msrs[i].data;
+            break;
+        case MSR_IA32_FRED_CONFIG:
+            env->fred_config = msrs[i].data;
+            break;
 #endif
         case MSR_IA32_TSC:
             env->tsc = msrs[i].data;
@@ -4255,6 +4400,9 @@ static int kvm_get_msrs(X86CPU *cpu)
         case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
             env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data;
             break;
+        case MSR_AMD64_SEV_ES_GHCB:
+            env->ghcb_gpa = msrs[i].data;
+            break;
         }
     }
 
@@ -4935,6 +5083,31 @@ static int kvm_handle_tpr_access(X86CPU *cpu)
     return 1;
 }
 
+static int kvm_handle_exit_hypercall(X86CPU *cpu, struct kvm_run *run)
+{
+    /*
+     * Currently this exit is only used by SEV guests for
+     * guest page encryption status tracking.
+     */
+    if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) {
+        unsigned long enc = run->hypercall.args[2];
+        unsigned long gpa = run->hypercall.args[0];
+        unsigned long npages = run->hypercall.args[1];
+        unsigned long gfn_start = gpa >> TARGET_PAGE_BITS;
+        unsigned long gfn_end = gfn_start + npages;
+
+        if (enc) {
+            sev_remove_shared_regions_list(gfn_start, gfn_end);
+            csv3_shared_region_dma_unmap(gpa, gfn_end << TARGET_PAGE_BITS);
+            csv3_shared_region_release(gpa, npages);
+        } else {
+            sev_add_shared_regions_list(gfn_start, gfn_end);
+            csv3_shared_region_dma_map(gpa, gfn_end << TARGET_PAGE_BITS);
+        }
+    }
+    return 0;
+}
+
 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
 {
     static const uint8_t int3 = 0xcc;
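The KVM_HC_MAP_GPA_RANGE layout used above (args[0] = GPA, args[1] = page count, args[2] = attributes carrying the encrypted bit) is easy to get wrong by one shift. A tiny self-contained check of the gfn arithmetic this handler relies on:

#include <assert.h>
#include <stdint.h>

#define PAGE_BITS 12

/* Mirror of the conversion done in kvm_handle_exit_hypercall(). */
static void gpa_range_to_gfns(uint64_t gpa, uint64_t npages,
                              uint64_t *gfn_start, uint64_t *gfn_end)
{
    *gfn_start = gpa >> PAGE_BITS;
    *gfn_end = *gfn_start + npages;   /* exclusive upper bound */
}

int main(void)
{
    uint64_t s, e;

    gpa_range_to_gfns(0x100000, 16, &s, &e);
    assert(s == 0x100 && e == 0x110);

    /* The range is half-open: [s, e) never includes e itself. */
    gpa_range_to_gfns(0, 1, &s, &e);
    assert(s == 0 && e == 1);
    return 0;
}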
@@ -5358,6 +5531,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
         ret = kvm_xen_handle_exit(cpu, &run->xen);
         break;
 #endif
+    case KVM_EXIT_HYPERCALL:
+        ret = kvm_handle_exit_hypercall(cpu, run);
+        break;
     default:
         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
         ret = -1;
@@ -5524,9 +5700,11 @@ void kvm_update_msi_routes_all(void *private, bool global,
 {
     int cnt = 0, vector;
     MSIRouteEntry *entry;
+    KVMRouteChange c;
     MSIMessage msg;
     PCIDevice *dev;
 
+    c = kvm_irqchip_begin_route_changes(kvm_state);
     /* TODO: explicit route update */
     QLIST_FOREACH(entry, &msi_route_list, list) {
         cnt++;
@@ -5543,9 +5721,9 @@ void kvm_update_msi_routes_all(void *private, bool global,
              */
             continue;
         }
-        kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
+        kvm_irqchip_update_msi_route(&c, entry->virq, msg, dev);
     }
-    kvm_irqchip_commit_routes(kvm_state);
+    kvm_irqchip_commit_route_changes(&c);
     trace_kvm_x86_update_msi_routes(cnt);
 }
 
@@ -5610,6 +5788,9 @@ bool kvm_has_waitpkg(void)
 
 bool kvm_arch_cpu_check_are_resettable(void)
 {
+    if (is_hygon_cpu())
+        return !csv_kvm_cpu_reset_inhibit;
+
     return !sev_es_enabled();
 }
 
diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
index 84d9143e60296d4388a57184b29693231e01205c..3c3f8cf93c1bfe17638beae51942163fc7f4003d 100644
--- a/target/i386/kvm/meson.build
+++ b/target/i386/kvm/meson.build
@@ -8,6 +8,7 @@ i386_kvm_ss.add(files(
 i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
 
 i386_kvm_ss.add(when: 'CONFIG_SEV', if_false: files('sev-stub.c'))
+i386_kvm_ss.add(when: 'CONFIG_CSV', if_false: files('csv-stub.c'))
 
 i386_system_ss.add(when: 'CONFIG_HYPERV', if_true: files('hyperv.c'), if_false: files('hyperv-stub.c'))
 
diff --git a/target/i386/kvm/sev-stub.c b/target/i386/kvm/sev-stub.c
index 1be5341e8a6a0e82997d88679f9cecce025d9e71..a0aac1117fbaca51154fc85553ea242cbc8cd647 100644
--- a/target/i386/kvm/sev-stub.c
+++ b/target/i386/kvm/sev-stub.c
@@ -14,8 +14,25 @@
 #include "qemu/osdep.h"
 #include "sev.h"
 
+bool sev_kvm_has_msr_ghcb;
+
 int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 {
     /* If we get here, cgs must be some non-SEV thing */
     return 0;
 }
+
+int sev_remove_shared_regions_list(unsigned long gfn_start,
+                                   unsigned long gfn_end)
+{
+    return 0;
+}
+
+int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end)
+{
+    return 0;
+}
+
+void sev_del_migrate_blocker(void)
+{
+}
diff --git a/target/i386/machine.c b/target/i386/machine.c
index a1041ef828cb2670e913738a44853b7eeadd1e7e..7cbfbc0efb051c899d59582f82eca8975e1d20bf 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -1544,6 +1544,33 @@ static const VMStateDescription vmstate_msr_xfd = {
 };
 
 #ifdef TARGET_X86_64
+static bool intel_fred_msrs_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    return !!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED);
+}
+
+static const VMStateDescription vmstate_msr_fred = {
+    .name = "cpu/fred",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = intel_fred_msrs_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(env.fred_rsp0, X86CPU),
+        VMSTATE_UINT64(env.fred_rsp1, X86CPU),
+        VMSTATE_UINT64(env.fred_rsp2, X86CPU),
+        VMSTATE_UINT64(env.fred_rsp3, X86CPU),
+        VMSTATE_UINT64(env.fred_stklvls, X86CPU),
+        VMSTATE_UINT64(env.fred_ssp1, X86CPU),
+        VMSTATE_UINT64(env.fred_ssp2, X86CPU),
+        VMSTATE_UINT64(env.fred_ssp3, X86CPU),
+        VMSTATE_UINT64(env.fred_config, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static bool amx_xtile_needed(void *opaque)
 {
     X86CPU *cpu = opaque;
@@ -1605,6 +1632,27 @@ static const VMStateDescription vmstate_triple_fault = {
     }
 };
 
+#if defined(CONFIG_KVM) && defined(TARGET_X86_64)
+static bool msr_ghcb_gpa_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+
+    return env->ghcb_gpa != 0;
+}
+
+static const VMStateDescription vmstate_msr_ghcb_gpa = {
+    .name = "cpu/svm_msr_ghcb_gpa",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = msr_ghcb_gpa_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(env.ghcb_gpa, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+#endif
+
 const VMStateDescription vmstate_x86_cpu = {
     .name = "cpu",
     .version_id = 12,
@@ -1747,10 +1795,14 @@ const VMStateDescription vmstate_x86_cpu = {
         &vmstate_pdptrs,
         &vmstate_msr_xfd,
 #ifdef TARGET_X86_64
+        &vmstate_msr_fred,
         &vmstate_amx_xtile,
 #endif
         &vmstate_arch_lbr,
         &vmstate_triple_fault,
+#if defined(CONFIG_KVM) && defined(TARGET_X86_64)
+        &vmstate_msr_ghcb_gpa,
+#endif
         NULL
     }
 };
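Both new blocks follow QEMU's standard VMState subsection pattern: a .needed predicate keeps the wire format backward compatible, because the subsection is only emitted when the feature is actually in use. A minimal sketch of that pattern for a hypothetical device counter (MyDev and the "mydev/counter" name are invented for illustration; this compiles only inside the QEMU tree against "migration/vmstate.h"):

/* Assumes "migration/vmstate.h"; MyDev is a hypothetical struct. */
typedef struct MyDev {
    uint64_t counter;
} MyDev;

static bool my_counter_needed(void *opaque)
{
    MyDev *d = opaque;

    /* Only ship the subsection when there is state worth migrating. */
    return d->counter != 0;
}

static const VMStateDescription vmstate_my_counter = {
    .name = "mydev/counter",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = my_counter_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(counter, MyDev),
        VMSTATE_END_OF_LIST()
    }
};

A destination QEMU that has never heard of the subsection still accepts streams from sources where .needed returned false, which is what makes the FRED and GHCB additions above safe for cross-version migration.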
diff --git a/target/i386/meson.build b/target/i386/meson.build
index 7c74bfa8591b279e506f64451de406f9d718691d..594a0a6abf7bdc028fa7e94390890c2547ead7db 100644
--- a/target/i386/meson.build
+++ b/target/i386/meson.build
@@ -21,6 +21,7 @@ i386_system_ss.add(files(
   'cpu-sysemu.c',
 ))
 i386_system_ss.add(when: 'CONFIG_SEV', if_true: files('sev.c'), if_false: files('sev-sysemu-stub.c'))
+i386_system_ss.add(when: 'CONFIG_CSV', if_true: files('csv.c'), if_false: files('csv-sysemu-stub.c'))
 
 i386_user_ss = ss.source_set()
 
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 9a71246682584da3204ce2723c0d26b830cb0b16..b4b42fd71650c05751983cd87ccf905abd9ea1bb 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -27,10 +27,13 @@
 #include "crypto/hash.h"
 #include "sysemu/kvm.h"
 #include "sev.h"
+#include "csv.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/runstate.h"
 #include "trace.h"
 #include "migration/blocker.h"
+#include "migration/qemu-file.h"
+#include "migration/misc.h"
 #include "qom/object.h"
 #include "monitor/monitor.h"
 #include "monitor/hmp-target.h"
@@ -42,6 +45,11 @@
 #define TYPE_SEV_GUEST "sev-guest"
 OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST)
 
+struct shared_region {
+    unsigned long gfn_start, gfn_end;
+    QTAILQ_ENTRY(shared_region) list;
+};
+
 
 /**
  * SevGuestState:
@@ -64,6 +72,9 @@ struct SevGuestState {
     uint32_t cbitpos;
     uint32_t reduced_phys_bits;
     bool kernel_hashes;
+    char *user_id;
+    char *secret_header_file;
+    char *secret_file;
 
     /* runtime state */
     uint32_t handle;
@@ -73,10 +84,27 @@ struct SevGuestState {
     int sev_fd;
     SevState state;
     gchar *measurement;
+    guchar *remote_pdh;
+    size_t remote_pdh_len;
+    guchar *remote_plat_cert;
+    size_t remote_plat_cert_len;
+    guchar *amd_cert;
+    size_t amd_cert_len;
+    gchar *send_packet_hdr;
+    size_t send_packet_hdr_len;
+
+    /* needed by live migration of HYGON CSV2 guest */
+    gchar *send_vmsa_packet_hdr;
+    size_t send_vmsa_packet_hdr_len;
 
     uint32_t reset_cs;
     uint32_t reset_ip;
     bool reset_data_valid;
+
+    QTAILQ_HEAD(, shared_region) shared_regions_list;
+
+    /* linked list used for HYGON CSV */
+    CsvBatchCmdList *csv_batch_cmd_list;
 };
 
 #define DEFAULT_GUEST_POLICY 0x1 /* disable debug */
@@ -127,6 +155,8 @@ QEMU_BUILD_BUG_ON(sizeof(PaddedSevHashTable) % 16 != 0);
 static SevGuestState *sev_guest;
 static Error *sev_mig_blocker;
 
+bool sev_kvm_has_msr_ghcb;
+
 static const char *const sev_fw_errlist[] = {
     [SEV_RET_SUCCESS]                = "",
     [SEV_RET_INVALID_PLATFORM_STATE] = "Platform state is invalid",
@@ -157,6 +187,29 @@ static const char *const sev_fw_errlist[] = {
 
 #define SEV_FW_MAX_ERROR      ARRAY_SIZE(sev_fw_errlist)
 
+#define SEV_FW_BLOB_MAX_SIZE  0x4000 /* 16KB */
+
+#define SHARED_REGION_LIST_CONT 0x1
+#define SHARED_REGION_LIST_END  0x2
+
+#define ENCRYPTED_CPU_STATE_CONT 0x1
+#define ENCRYPTED_CPU_STATE_END  0x2
+
+static struct ConfidentialGuestMemoryEncryptionOps sev_memory_encryption_ops = {
+    .save_setup = sev_save_setup,
+    .save_outgoing_page = sev_save_outgoing_page,
+    .load_incoming_page = sev_load_incoming_page,
+    .is_gfn_in_unshared_region = sev_is_gfn_in_unshared_region,
+    .save_outgoing_shared_regions_list = sev_save_outgoing_shared_regions_list,
+    .load_incoming_shared_regions_list = sev_load_incoming_shared_regions_list,
+    .queue_outgoing_page = csv_queue_outgoing_page,
+    .save_queued_outgoing_pages = csv_save_queued_outgoing_pages,
+    .queue_incoming_page = csv_queue_incoming_page,
+    .load_queued_incoming_pages = csv_load_queued_incoming_pages,
+    .save_outgoing_cpu_state = csv_save_outgoing_cpu_state,
+    .load_incoming_cpu_state = csv_load_incoming_cpu_state,
+};
+
 static int
 sev_ioctl(int fd, int cmd, void *data, int *error)
 {
@@ -323,6 +376,54 @@ sev_guest_set_dh_cert_file(Object *obj, const char *value, Error **errp)
     s->dh_cert_file = g_strdup(value);
 }
 
+static char *
+sev_guest_get_user_id(Object *obj, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    return g_strdup(s->user_id);
+}
+
+static void
+sev_guest_set_user_id(Object *obj, const char *value, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    s->user_id = g_strdup(value);
+}
+
+static char *
+sev_guest_get_secret_header_file(Object *obj, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    return g_strdup(s->secret_header_file);
+}
+
+static void
+sev_guest_set_secret_header_file(Object *obj, const char *value, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    s->secret_header_file = g_strdup(value);
+}
+
+static char *
+sev_guest_get_secret_file(Object *obj, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    return g_strdup(s->secret_file);
+}
+
+static void
+sev_guest_set_secret_file(Object *obj, const char *value, Error **errp)
+{
+    SevGuestState *s = SEV_GUEST(obj);
+
+    s->secret_file = g_strdup(value);
+}
+
 static char *
 sev_guest_get_sev_device(Object *obj, Error **errp)
 {
@@ -376,6 +477,21 @@ sev_guest_class_init(ObjectClass *oc, void *data)
                                    sev_guest_set_kernel_hashes);
     object_class_property_set_description(oc, "kernel-hashes",
             "add kernel hashes to guest firmware for measured Linux boot");
+    object_class_property_add_str(oc, "user-id",
+                                  sev_guest_get_user_id,
+                                  sev_guest_set_user_id);
+    object_class_property_set_description(oc, "user-id",
+            "user id of the guest owner");
+    object_class_property_add_str(oc, "secret-header-file",
+                                  sev_guest_get_secret_header_file,
+                                  sev_guest_set_secret_header_file);
+    object_class_property_set_description(oc, "secret-header-file",
+            "header file of the guest owner's secret");
+    object_class_property_add_str(oc, "secret-file",
+                                  sev_guest_get_secret_file,
+                                  sev_guest_set_secret_file);
+    object_class_property_set_description(oc, "secret-file",
+            "file of the guest owner's secret");
 }
 
 static void
@@ -539,7 +655,8 @@ static int sev_get_cpu0_id(int fd, guchar **id, size_t *id_len, Error **errp)
 
     /* query the ID length */
     r = sev_platform_ioctl(fd, SEV_GET_ID2, &get_id2, &err);
-    if (r < 0 && err != SEV_RET_INVALID_LEN) {
+    if (r < 0 && err != SEV_RET_INVALID_LEN &&
+        !(is_hygon_cpu() && err == SEV_RET_INVALID_PARAM)) {
         error_setg(errp, "SEV: Failed to get ID ret=%d fw_err=%d (%s)",
                    r, err, fw_error_to_str(err));
         return 1;
@@ -794,6 +911,9 @@ sev_launch_update_vmsa(SevGuestState *sev)
     return ret;
 }
 
+static int
+csv_load_launch_secret(const char *secret_header_file, const char *secret_file);
+
 static void
 sev_launch_get_measure(Notifier *notifier, void *unused)
 {
@@ -807,8 +927,12 @@ sev_launch_get_measure(Notifier *notifier, void *unused)
     }
 
     if (sev_es_enabled()) {
-        /* measure all the VM save areas before getting launch_measure */
-        ret = sev_launch_update_vmsa(sev);
+        if (csv3_enabled()) {
+            ret = csv3_launch_encrypt_vmcb();
+        } else {
+            /* measure all the VM save areas before getting launch_measure */
+            ret = sev_launch_update_vmsa(sev);
+        }
         if (ret) {
             exit(1);
         }
@@ -840,6 +964,15 @@ sev_launch_get_measure(Notifier *notifier, void *unused)
     /* encode the measurement value and emit the event */
     sev->measurement = g_base64_encode(data, measurement.len);
     trace_kvm_sev_launch_measurement(sev->measurement);
+
+    /* Hygon CSV will automatically load the guest owner's secret */
+    if (is_hygon_cpu()) {
+        if (sev->secret_header_file &&
+            strlen(sev->secret_header_file) &&
+            sev->secret_file &&
+            strlen(sev->secret_file))
+            csv_load_launch_secret(sev->secret_header_file, sev->secret_file);
+    }
 }
 
 static char *sev_get_launch_measurement(void)
@@ -894,18 +1027,142 @@ sev_launch_finish(SevGuestState *sev)
     migrate_add_blocker(&sev_mig_blocker, &error_fatal);
 }
 
+void
+sev_del_migrate_blocker(void)
+{
+    migrate_del_blocker(&sev_mig_blocker);
+}
+
+static int
+sev_receive_finish(SevGuestState *s)
+{
+    int error, ret = 1;
+
+    trace_kvm_sev_receive_finish();
+    ret = sev_ioctl(s->sev_fd, KVM_SEV_RECEIVE_FINISH, 0, &error);
+    if (ret) {
+        error_report("%s: RECEIVE_FINISH ret=%d fw_error=%d '%s'",
+                     __func__, ret, error, fw_error_to_str(error));
+        goto err;
+    }
+
+    sev_set_guest_state(s, SEV_STATE_RUNNING);
+err:
+    return ret;
+}
+
 static void
 sev_vm_state_change(void *opaque, bool running, RunState state)
 {
     SevGuestState *sev = opaque;
 
     if (running) {
-        if (!sev_check_state(sev, SEV_STATE_RUNNING)) {
+        if (sev_check_state(sev, SEV_STATE_RECEIVE_UPDATE)) {
+            sev_receive_finish(sev);
+        } else if (!sev_check_state(sev, SEV_STATE_RUNNING)) {
             sev_launch_finish(sev);
         }
     }
 }
 
+static inline bool check_blob_length(size_t value)
+{
+    if (value > SEV_FW_BLOB_MAX_SIZE) {
+        error_report("invalid length max=%d got=%ld",
+                     SEV_FW_BLOB_MAX_SIZE, value);
+        return false;
+    }
+
+    return true;
+}
+
+int sev_save_setup(const char *pdh, const char *plat_cert,
+                   const char *amd_cert)
+{
+    SevGuestState *s = sev_guest;
+
+    if (is_hygon_cpu()) {
+        if (sev_read_file_base64(pdh, &s->remote_pdh,
+                                 &s->remote_pdh_len) < 0) {
+            goto error;
+        }
+    } else {
+        s->remote_pdh = g_base64_decode(pdh, &s->remote_pdh_len);
+    }
+    if (!check_blob_length(s->remote_pdh_len)) {
+        goto error;
+    }
+
+    if (is_hygon_cpu()) {
+        if (sev_read_file_base64(plat_cert, &s->remote_plat_cert,
+                                 &s->remote_plat_cert_len) < 0) {
+            goto error;
+        }
+    } else {
+        s->remote_plat_cert = g_base64_decode(plat_cert,
+                                              &s->remote_plat_cert_len);
+    }
+    if (!check_blob_length(s->remote_plat_cert_len)) {
+        goto error;
+    }
+
+    if (is_hygon_cpu()) {
+        if (sev_read_file_base64(amd_cert, &s->amd_cert,
+                                 &s->amd_cert_len) < 0) {
+            goto error;
+        }
+    } else {
+        s->amd_cert = g_base64_decode(amd_cert, &s->amd_cert_len);
+    }
+    if (!check_blob_length(s->amd_cert_len)) {
+        goto error;
+    }
+
+    return 0;
+
+error:
+    g_free(s->remote_pdh);
+    g_free(s->remote_plat_cert);
+    g_free(s->amd_cert);
+
+    return 1;
+}
+
+static void
+sev_send_finish(void)
+{
+    int ret, error;
+
+    trace_kvm_sev_send_finish();
+    ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_FINISH, 0, &error);
+    if (ret) {
+        error_report("%s: SEND_FINISH ret=%d fw_error=%d '%s'",
+                     __func__, ret, error, fw_error_to_str(error));
+    }
+
+    g_free(sev_guest->send_packet_hdr);
+    if (sev_es_enabled() && is_hygon_cpu()) {
+        g_free(sev_guest->send_vmsa_packet_hdr);
+    }
+    sev_set_guest_state(sev_guest, SEV_STATE_RUNNING);
+}
+
+static void
+sev_migration_state_notifier(Notifier *notifier, void *data)
+{
+    MigrationState *s = data;
+
+    if (migration_has_finished(s) ||
+        migration_in_postcopy_after_devices(s) ||
+        migration_has_failed(s)) {
+        if (sev_check_state(sev_guest, SEV_STATE_SEND_UPDATE)) {
+            sev_send_finish();
+        }
+    }
+}
+
+static Notifier sev_migration_state;
+
 int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 {
     SevGuestState *sev
@@ -920,6 +1177,9 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
         return 0;
     }
 
+    ConfidentialGuestSupportClass *cgs_class =
+        (ConfidentialGuestSupportClass *) object_get_class(OBJECT(cgs));
+
     ret = ram_block_discard_disable(true);
     if (ret) {
         error_report("%s: cannot disable RAM discard", __func__);
@@ -996,22 +1256,90 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
     }
 
     trace_kvm_sev_init();
-    ret = sev_ioctl(sev->sev_fd, cmd, NULL, &fw_error);
+
+    /* Only support reuse asid for CSV/CSV2 guest */
+    if (is_hygon_cpu() &&
+        (sev_guest->policy & GUEST_POLICY_REUSE_ASID)) {
+        char *user_id = NULL;
+        struct kvm_csv_init *init_cmd_buf = NULL;
+
+        user_id = object_property_get_str(OBJECT(sev), "user-id", NULL);
+        if (user_id && strlen(user_id)) {
+            init_cmd_buf = g_new0(struct kvm_csv_init, 1);
+            init_cmd_buf->len = strlen(user_id);
+            init_cmd_buf->userid_addr = (__u64)user_id;
+        }
+        ret = sev_ioctl(sev->sev_fd, cmd, init_cmd_buf, &fw_error);
+
+        if (user_id) {
+            g_free(user_id);
+            g_free(init_cmd_buf);
+        }
+    } else {
+        ret = sev_ioctl(sev->sev_fd, cmd, NULL, &fw_error);
+    }
+
     if (ret) {
         error_setg(errp, "%s: failed to initialize ret=%d fw_error=%d '%s'",
                    __func__, ret, fw_error, fw_error_to_str(fw_error));
         goto err;
     }
 
-    ret = sev_launch_start(sev);
-    if (ret) {
-        error_setg(errp, "%s: failed to create encryption context", __func__);
-        goto err;
+    /* Support CSV3 */
+    if (!ret && cmd == KVM_SEV_ES_INIT) {
+        ret = csv3_init(sev_guest->policy, sev->sev_fd, (void *)&sev->state, &sev_ops);
+        if (ret) {
+            error_setg(errp, "%s: failed to init csv3 context", __func__);
+            goto err;
+        }
+        /* The CSV3 guest is not resettable */
+        if (csv3_enabled())
+            csv_kvm_cpu_reset_inhibit = true;
+    }
+
+    /*
+     * The LAUNCH context is used for a new guest; if it's an incoming guest,
+     * the RECEIVE context will be created after the connection is established.
+     */
+    if (!runstate_check(RUN_STATE_INMIGRATE)) {
+        ret = sev_launch_start(sev);
+        if (ret) {
+            error_setg(errp, "%s: failed to create encryption context", __func__);
+            goto err;
+        }
+    } else {
+        /*
+         * The CSV2 guest is not resettable after being migrated to the target
+         * machine; set csv_kvm_cpu_reset_inhibit to true to indicate that the
+         * CSV2 guest is not resettable.
+         */
+        if (is_hygon_cpu() && sev_es_enabled()) {
+            csv_kvm_cpu_reset_inhibit = true;
+        }
     }
 
-    ram_block_notifier_add(&sev_ram_notifier);
+    /* CSV3 guests do not need a notifier to register/unregister memory */
+    if (!csv3_enabled()) {
+        ram_block_notifier_add(&sev_ram_notifier);
+    }
     qemu_add_machine_init_done_notifier(&sev_machine_done_notify);
     qemu_add_vm_change_state_handler(sev_vm_state_change, sev);
+    migration_add_notifier(&sev_migration_state, sev_migration_state_notifier);
+
+    if (csv3_enabled()) {
+        cgs_class->memory_encryption_ops = &csv3_memory_encryption_ops;
+    } else {
+        cgs_class->memory_encryption_ops = &sev_memory_encryption_ops;
+    }
+    QTAILQ_INIT(&sev->shared_regions_list);
+
+    /* Determine whether MSR_AMD64_SEV_ES_GHCB is supported */
+    if (sev_es_enabled()) {
+        sev_kvm_has_msr_ghcb =
+            kvm_vm_check_extension(kvm_state, KVM_CAP_SEV_ES_GHCB);
+    } else {
+        sev_kvm_has_msr_ghcb = false;
+    }
 
     cgs->ready = true;
 
@@ -1044,6 +1372,7 @@ sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp)
 int sev_inject_launch_secret(const char *packet_hdr, const char *secret,
                              uint64_t gpa, Error **errp)
 {
+    ERRP_GUARD();
     struct kvm_sev_launch_secret input;
     g_autofree guchar *data = NULL, *hdr = NULL;
     int error, ret = 1;
@@ -1087,7 +1416,17 @@ int sev_inject_launch_secret(const char *packet_hdr, const char *secret,
     input.trans_uaddr = (uint64_t)(unsigned long)data;
     input.trans_len = data_sz;
 
-    input.guest_uaddr = (uint64_t)(unsigned long)hva;
+    /* For Hygon CSV3 guest, the guest_uaddr should be the gpa */
+    if (csv3_enabled()) {
+        if (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET) {
+            input.guest_uaddr = gpa;
+        } else {
+            error_setg(errp, "CSV3 inject secret unsupported!");
+            return 1;
+        }
+    } else {
+        input.guest_uaddr = (uint64_t)(unsigned long)hva;
+    }
     input.guest_len = data_sz;
 
     trace_kvm_sev_launch_secret(gpa, input.guest_uaddr,
@@ -1251,74 +1590,1120 @@ int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size)
     return 0;
 }
 
-static const QemuUUID sev_hash_table_header_guid = {
-    .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93,
-                    0xd4, 0x11, 0xfd, 0x21)
-};
+static int
+sev_get_send_session_length(void)
+{
+    int ret, fw_err = 0;
+    struct kvm_sev_send_start start = {};
+
+    ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_START, &start, &fw_err);
+    if (fw_err != SEV_RET_INVALID_LEN) {
+        ret = -1;
+        error_report("%s: failed to get session length ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_err, fw_error_to_str(fw_err));
+        goto err;
+    }
 
-static const QemuUUID sev_kernel_entry_guid = {
-    .data = UUID_LE(0x4de79437, 0xabd2, 0x427f, 0xb8, 0x35, 0xd5, 0xb1,
-                    0x72, 0xd2, 0x04, 0x5b)
-};
-static const QemuUUID sev_initrd_entry_guid = {
-    .data = UUID_LE(0x44baf731, 0x3a2f, 0x4bd7, 0x9a, 0xf1, 0x41, 0xe2,
-                    0x91, 0x69, 0x78, 0x1d)
-};
-static const QemuUUID sev_cmdline_entry_guid = {
-    .data = UUID_LE(0x97d02dd8, 0xbd20, 0x4c94, 0xaa, 0x78, 0xe7, 0x71,
-                    0x4d, 0x36, 0xab, 0x2a)
-};
+    ret = start.session_len;
+err:
+    return ret;
+}
 
-/*
- * Add the hashes of the linux kernel/initrd/cmdline to an encrypted guest page
- * which is included in SEV's initial memory measurement.
- */
-bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp)
+static int
+sev_send_start(SevGuestState *s, QEMUFile *f, uint64_t *bytes_sent)
 {
-    uint8_t *data;
-    SevHashTableDescriptor *area;
-    SevHashTable *ht;
-    PaddedSevHashTable *padded_ht;
-    uint8_t cmdline_hash[HASH_SIZE];
-    uint8_t initrd_hash[HASH_SIZE];
-    uint8_t kernel_hash[HASH_SIZE];
-    uint8_t *hashp;
-    size_t hash_len = HASH_SIZE;
-    hwaddr mapped_len = sizeof(*padded_ht);
-    MemTxAttrs attrs = { 0 };
-    bool ret = true;
+    gsize pdh_len = 0, plat_cert_len;
+    int session_len, ret, fw_error;
+    struct kvm_sev_send_start start = { };
+    guchar *pdh = NULL, *plat_cert = NULL, *session = NULL;
+    Error *local_err = NULL;
+
+    if (!s->remote_pdh || !s->remote_plat_cert || !s->amd_cert_len) {
+        error_report("%s: missing remote PDH or PLAT_CERT", __func__);
+        return 1;
+    }
+
+    start.pdh_cert_uaddr = (uintptr_t) s->remote_pdh;
+    start.pdh_cert_len = s->remote_pdh_len;
+
+    start.plat_certs_uaddr = (uintptr_t)s->remote_plat_cert;
+    start.plat_certs_len = s->remote_plat_cert_len;
+
+    start.amd_certs_uaddr = (uintptr_t)s->amd_cert;
+    start.amd_certs_len = s->amd_cert_len;
+
+    /* get the session length */
+    session_len = sev_get_send_session_length();
+    if (session_len < 0) {
+        ret = 1;
+        goto err;
+    }
+
+    session = g_new0(guchar, session_len);
+    start.session_uaddr = (unsigned long)session;
+    start.session_len = session_len;
+
+    /* Get our PDH certificate */
+    ret = sev_get_pdh_info(s->sev_fd, &pdh, &pdh_len,
+                           &plat_cert, &plat_cert_len, &local_err);
+    if (ret) {
+        error_report("Failed to get our PDH cert");
+        goto err;
+    }
+
+    trace_kvm_sev_send_start(start.pdh_cert_uaddr, start.pdh_cert_len,
+                             start.plat_certs_uaddr, start.plat_certs_len,
+                             start.amd_certs_uaddr, start.amd_certs_len);
+
+    ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_START, &start, &fw_error);
+    if (ret < 0) {
+        error_report("%s: SEND_START ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+        goto err;
+    }
+
+    qemu_put_be32(f, start.policy);
+    qemu_put_be32(f, pdh_len);
+    qemu_put_buffer(f, (uint8_t *)pdh, pdh_len);
+    qemu_put_be32(f, start.session_len);
+    qemu_put_buffer(f, (uint8_t *)start.session_uaddr, start.session_len);
+    *bytes_sent = 12 + pdh_len + start.session_len;
+
+    sev_set_guest_state(s, SEV_STATE_SEND_UPDATE);
+
+err:
+    g_free(pdh);
+    g_free(plat_cert);
+    return ret;
+}
+
+static int
+sev_send_get_packet_len(int *fw_err)
+{
+    int ret;
+    struct kvm_sev_send_update_data update = { 0, };
+
+    ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_UPDATE_DATA,
+                    &update, fw_err);
+    if (*fw_err != SEV_RET_INVALID_LEN) {
+        ret = 0;
+        error_report("%s: failed to get session length ret=%d fw_error=%d '%s'",
+                     __func__, ret, *fw_err, fw_error_to_str(*fw_err));
+        goto err;
+    }
+
+    ret = update.hdr_len;
+
+err:
+    return ret;
+}
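sev_send_start() frames the migration start packet as three big-endian 32-bit fields (policy, pdh_len, session_len) plus the two blobs, which is exactly why *bytes_sent starts at 12. A self-contained sketch of the matching parser, assuming the same be32 framing (parse_start_packet() is an illustrative helper, not a QEMU API):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Layout: be32 policy | be32 pdh_len | pdh | be32 session_len | session. */
static size_t parse_start_packet(const uint8_t *buf, uint32_t *policy,
                                 const uint8_t **pdh, uint32_t *pdh_len,
                                 const uint8_t **session, uint32_t *session_len)
{
    uint32_t v;
    size_t off = 0;

    memcpy(&v, buf + off, 4); *policy = ntohl(v); off += 4;
    memcpy(&v, buf + off, 4); *pdh_len = ntohl(v); off += 4;
    *pdh = buf + off; off += *pdh_len;
    memcpy(&v, buf + off, 4); *session_len = ntohl(v); off += 4;
    *session = buf + off; off += *session_len;
    return off;   /* == 12 + pdh_len + session_len, i.e. bytes_sent */
}

int main(void)
{
    uint8_t buf[32] = { 0 };
    uint32_t be = htonl(3); memcpy(buf, &be, 4);       /* policy */
    be = htonl(2); memcpy(buf + 4, &be, 4);            /* pdh_len */
    buf[8] = 0xaa; buf[9] = 0xbb;                      /* pdh blob */
    be = htonl(1); memcpy(buf + 10, &be, 4);           /* session_len */
    buf[14] = 0xcc;                                    /* session blob */

    uint32_t policy, pdh_len, session_len;
    const uint8_t *pdh, *session;
    size_t n = parse_start_packet(buf, &policy, &pdh, &pdh_len,
                                  &session, &session_len);
    printf("consumed %zu bytes (12 + %u + %u)\n", n, pdh_len, session_len);
    return 0;
}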
*/ - if (!sev_guest->kernel_hashes) { - return false; - } + if (!s->send_packet_hdr) { + s->send_packet_hdr_len = sev_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } - if (!pc_system_ovmf_table_find(SEV_HASH_TABLE_RV_GUID, &data, NULL)) { - error_setg(errp, "SEV: kernel specified but guest firmware " - "has no hashes table GUID"); - return false; + s->send_packet_hdr = g_new(gchar, s->send_packet_hdr_len); } - area = (SevHashTableDescriptor *)data; - if (!area->base || area->size < sizeof(PaddedSevHashTable)) { - error_setg(errp, "SEV: guest firmware hashes table area is invalid " - "(base=0x%x size=0x%x)", area->base, area->size); - return false; + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update.hdr_uaddr = (uintptr_t)s->send_packet_hdr; + update.hdr_len = s->send_packet_hdr_len; + update.guest_uaddr = (uintptr_t)ptr; + update.guest_len = size; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = size; + + trace_kvm_sev_send_update_data(ptr, trans, size); + + ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_UPDATE_DATA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_UPDATE_DATA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; } + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent = 4 + update.hdr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += (4 + update.trans_len); + +err: + g_free(trans); + return ret; +} + +int sev_save_outgoing_page(QEMUFile *f, uint8_t *ptr, + uint32_t sz, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + /* - * Calculate hash of kernel command-line with the terminating null byte. If - * the user doesn't supply a command-line via -append, the 1-byte "\0" will - * be used. + * If this is a first buffer then create outgoing encryption context + * and write our PDH, policy and session data. 
*/ - hashp = cmdline_hash; - if (qcrypto_hash_bytes(QCRYPTO_HASH_ALG_SHA256, ctx->cmdline_data, - ctx->cmdline_size, &hashp, &hash_len, errp) < 0) { - return false; + if (!sev_check_state(s, SEV_STATE_SEND_UPDATE) && + sev_send_start(s, f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; } - assert(hash_len == HASH_SIZE); + + return sev_send_update_data(s, f, ptr, sz, bytes_sent); +} + +static int +sev_receive_start(SevGuestState *sev, QEMUFile *f) +{ + int ret = 1; + int fw_error; + struct kvm_sev_receive_start start = { }; + gchar *session = NULL, *pdh_cert = NULL; + + /* get SEV guest handle */ + start.handle = object_property_get_int(OBJECT(sev), "handle", + &error_abort); + + /* get the source policy */ + start.policy = qemu_get_be32(f); + + /* get source PDH key */ + start.pdh_len = qemu_get_be32(f); + if (!check_blob_length(start.pdh_len)) { + return 1; + } + + pdh_cert = g_new(gchar, start.pdh_len); + qemu_get_buffer(f, (uint8_t *)pdh_cert, start.pdh_len); + start.pdh_uaddr = (uintptr_t)pdh_cert; + + /* get source session data */ + start.session_len = qemu_get_be32(f); + if (!check_blob_length(start.session_len)) { + return 1; + } + session = g_new(gchar, start.session_len); + qemu_get_buffer(f, (uint8_t *)session, start.session_len); + start.session_uaddr = (uintptr_t)session; + + trace_kvm_sev_receive_start(start.policy, session, pdh_cert); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_START, + &start, &fw_error); + if (ret < 0) { + error_report("Error RECEIVE_START ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + object_property_set_int(OBJECT(sev), "handle", start.handle, &error_abort); + sev_set_guest_state(sev, SEV_STATE_RECEIVE_UPDATE); +err: + g_free(session); + g_free(pdh_cert); + + return ret; +} + +static int sev_receive_update_data(QEMUFile *f, uint8_t *ptr) +{ + int ret = 1, fw_error = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_data update = {}; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + if (!check_blob_length(update.hdr_len)) { + return 1; + } + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + if (!check_blob_length(update.trans_len)) { + goto err; + } + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + update.guest_uaddr = (uintptr_t) ptr; + update.guest_len = update.trans_len; + + trace_kvm_sev_receive_update_data(trans, ptr, update.guest_len, + hdr, update.hdr_len); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_UPDATE_DATA, + &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_UPDATE_DATA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } +err: + g_free(trans); + g_free(hdr); + return ret; +} + +int sev_load_incoming_page(QEMUFile *f, uint8_t *ptr) +{ + SevGuestState *s = sev_guest; + + /* + * If this is first buffer and SEV is not in recieiving state then + * use RECEIVE_START command to create a encryption context. 
+     */
+    if (!sev_check_state(s, SEV_STATE_RECEIVE_UPDATE) &&
+        sev_receive_start(s, f)) {
+        return 1;
+    }
+
+    return sev_receive_update_data(f, ptr);
+}
+
+int sev_remove_shared_regions_list(unsigned long start, unsigned long end)
+{
+    SevGuestState *s = sev_guest;
+    struct shared_region *pos, *next_pos;
+
+    QTAILQ_FOREACH_SAFE(pos, &s->shared_regions_list, list, next_pos) {
+        unsigned long l, r;
+        unsigned long curr_gfn_end = pos->gfn_end;
+
+        /*
+         * Find if any intersection exists?
+         * left bound for intersecting segment
+         */
+        l = MAX(start, pos->gfn_start);
+        /* right bound for intersecting segment */
+        r = MIN(end, pos->gfn_end);
+        if (l <= r) {
+            if (pos->gfn_start == l && pos->gfn_end == r) {
+                QTAILQ_REMOVE(&s->shared_regions_list, pos, list);
+                g_free(pos);
+            } else if (l == pos->gfn_start) {
+                pos->gfn_start = r;
+            } else if (r == pos->gfn_end) {
+                pos->gfn_end = l;
+            } else {
+                /* Do a de-merge -- split linked list nodes */
+                struct shared_region *shrd_region;
+
+                pos->gfn_end = l;
+                shrd_region = g_malloc0(sizeof(*shrd_region));
+                if (!shrd_region) {
+                    return 0;
+                }
+                shrd_region->gfn_start = r;
+                shrd_region->gfn_end = curr_gfn_end;
+                QTAILQ_INSERT_AFTER(&s->shared_regions_list, pos,
+                                    shrd_region, list);
+            }
+        }
+        if (end <= curr_gfn_end) {
+            break;
+        }
+    }
+    return 0;
+}
+
+int sev_add_shared_regions_list(unsigned long start, unsigned long end)
+{
+    struct shared_region *shrd_region;
+    struct shared_region *pos;
+    SevGuestState *s = sev_guest;
+
+    if (QTAILQ_EMPTY(&s->shared_regions_list)) {
+        shrd_region = g_malloc0(sizeof(*shrd_region));
+        if (!shrd_region) {
+            return -1;
+        }
+        shrd_region->gfn_start = start;
+        shrd_region->gfn_end = end;
+        QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list);
+        return 0;
+    }
+
+    /*
+     * shared regions list is a sorted list in ascending order
+     * of guest PA's and also merges consecutive ranges of guest PA's
+     */
+    QTAILQ_FOREACH(pos, &s->shared_regions_list, list) {
+        /* handle duplicate overlapping regions */
+        if (start >= pos->gfn_start && end <= pos->gfn_end) {
+            return 0;
+        }
+        if (pos->gfn_end < start) {
+            continue;
+        }
+        /* merge consecutive guest PA(s) -- forward merge */
+        if (pos->gfn_start <= start && pos->gfn_end >= start) {
+            pos->gfn_end = end;
+            return 0;
+        }
+        break;
+    }
+    /*
+     * Add a new node
+     */
+    shrd_region = g_malloc0(sizeof(*shrd_region));
+    if (!shrd_region) {
+        return -1;
+    }
+    shrd_region->gfn_start = start;
+    shrd_region->gfn_end = end;
+    if (pos) {
+        QTAILQ_INSERT_BEFORE(pos, shrd_region, list);
+    } else {
+        QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list);
+    }
+    return 1;
+}
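The shared-regions list is an ordered set of half-open gfn intervals: an add may merge with a neighbour, and a removal may split one node into two (the "de-merge" branch above). A compact self-contained model of that split logic, useful for sanity-checking the MAX/MIN bound arithmetic:

#include <assert.h>

/* One interval [start, end); mirrors struct shared_region's gfn bounds. */
struct range { unsigned long start, end; };

/*
 * Remove [rs, re) from *r. Returns how many pieces survive (0, 1 or 2)
 * and stores a possible right-hand remainder in *tail.
 */
static int range_remove(struct range *r, unsigned long rs, unsigned long re,
                        struct range *tail)
{
    unsigned long l = rs > r->start ? rs : r->start;   /* MAX */
    unsigned long rr = re < r->end ? re : r->end;      /* MIN */

    if (l > rr) {
        return 1;                 /* no overlap: node unchanged */
    }
    if (l == r->start && rr == r->end) {
        return 0;                 /* fully covered: node disappears */
    }
    if (l == r->start) {
        r->start = rr;            /* trim from the left */
        return 1;
    }
    if (rr == r->end) {
        r->end = l;               /* trim from the right */
        return 1;
    }
    /* hole in the middle: split into [start, l) and [rr, end) */
    tail->start = rr;
    tail->end = r->end;
    r->end = l;
    return 2;
}

int main(void)
{
    struct range r = { 10, 20 }, tail;

    assert(range_remove(&r, 12, 15, &tail) == 2);
    assert(r.start == 10 && r.end == 12);
    assert(tail.start == 15 && tail.end == 20);
    return 0;
}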
+
+int sev_save_outgoing_shared_regions_list(QEMUFile *f, uint64_t *bytes_sent)
+{
+    SevGuestState *s = sev_guest;
+    struct shared_region *pos;
+
+    QTAILQ_FOREACH(pos, &s->shared_regions_list, list) {
+        qemu_put_be32(f, SHARED_REGION_LIST_CONT);
+        qemu_put_be32(f, pos->gfn_start);
+        qemu_put_be32(f, pos->gfn_end);
+        *bytes_sent += 12;
+    }
+
+    qemu_put_be32(f, SHARED_REGION_LIST_END);
+    *bytes_sent += 4;
+    return 0;
+}
+
+int sev_load_incoming_shared_regions_list(QEMUFile *f)
+{
+    SevGuestState *s = sev_guest;
+    struct shared_region *shrd_region;
+    int status;
+
+    status = qemu_get_be32(f);
+    while (status == SHARED_REGION_LIST_CONT) {
+
+        shrd_region = g_malloc0(sizeof(*shrd_region));
+        if (!shrd_region) {
+            return 0;
+        }
+        shrd_region->gfn_start = qemu_get_be32(f);
+        shrd_region->gfn_end = qemu_get_be32(f);
+
+        QTAILQ_INSERT_TAIL(&s->shared_regions_list, shrd_region, list);
+
+        status = qemu_get_be32(f);
+    }
+    return 0;
+}
+
+bool sev_is_gfn_in_unshared_region(unsigned long gfn)
+{
+    SevGuestState *s = sev_guest;
+    struct shared_region *pos;
+
+    QTAILQ_FOREACH(pos, &s->shared_regions_list, list) {
+        if (gfn >= pos->gfn_start && gfn < pos->gfn_end) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static CsvBatchCmdList *
+csv_batch_cmd_list_create(struct kvm_csv_batch_list_node *head,
+                          CsvDestroyCmdNodeFn func)
+{
+    CsvBatchCmdList *csv_batch_cmd_list =
+        g_malloc0(sizeof(*csv_batch_cmd_list));
+
+    if (!csv_batch_cmd_list) {
+        return NULL;
+    }
+
+    csv_batch_cmd_list->head = head;
+    csv_batch_cmd_list->tail = head;
+    csv_batch_cmd_list->destroy_fn = func;
+
+    return csv_batch_cmd_list;
+}
+
+static int
+csv_batch_cmd_list_add_after(CsvBatchCmdList *list,
+                             struct kvm_csv_batch_list_node *new_node)
+{
+    list->tail->next_cmd_addr = (__u64)new_node;
+    list->tail = new_node;
+
+    return 0;
+}
+
+static struct kvm_csv_batch_list_node *
+csv_batch_cmd_list_node_create(uint64_t cmd_data_addr, uint64_t addr)
+{
+    struct kvm_csv_batch_list_node *new_node =
+        g_malloc0(sizeof(struct kvm_csv_batch_list_node));
+
+    if (!new_node) {
+        return NULL;
+    }
+
+    new_node->cmd_data_addr = cmd_data_addr;
+    new_node->addr = addr;
+    new_node->next_cmd_addr = 0;
+
+    return new_node;
+}
+
+static int csv_batch_cmd_list_destroy(CsvBatchCmdList *list)
+{
+    struct kvm_csv_batch_list_node *node = list->head;
+
+    while (node != NULL) {
+        if (list->destroy_fn != NULL)
+            list->destroy_fn((void *)node->cmd_data_addr);
+
+        list->head = (struct kvm_csv_batch_list_node *)node->next_cmd_addr;
+        g_free(node);
+        node = list->head;
+    }
+
+    g_free(list);
+    return 0;
+}
+
+static void send_update_data_free(void *data)
+{
+    struct kvm_sev_send_update_data *update =
+        (struct kvm_sev_send_update_data *)data;
+    g_free((guchar *)update->hdr_uaddr);
+    g_free((guchar *)update->trans_uaddr);
+    g_free(update);
+}
+
+static void receive_update_data_free(void *data)
+{
+    struct kvm_sev_receive_update_data *update =
+        (struct kvm_sev_receive_update_data *)data;
+    g_free((guchar *)update->hdr_uaddr);
+    g_free((guchar *)update->trans_uaddr);
+    g_free(update);
+}
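The csv_batch_cmd_list_* helpers implement an intrusive singly linked list whose links (next_cmd_addr) are raw userspace addresses, so the kernel can walk the entire batch starting from nothing but the head pointer passed to KVM_CSV_COMMAND_BATCH. A stripped-down, self-contained model of that layout:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same shape as kvm_csv_batch_list_node: links stored as raw u64 addresses. */
struct node {
    uint64_t cmd_data_addr;
    uint64_t addr;
    uint64_t next_cmd_addr;   /* 0 terminates the chain */
};

int main(void)
{
    struct node *head = NULL, *tail = NULL;

    /* Build a three-entry batch the same way the queueing code does. */
    for (int i = 0; i < 3; i++) {
        struct node *n = calloc(1, sizeof(*n));
        n->addr = (uint64_t)i * 4096;
        if (!head) {
            head = tail = n;
        } else {
            tail->next_cmd_addr = (uint64_t)(uintptr_t)n;
            tail = n;
        }
    }

    /* Walking from head needs nothing but the embedded addresses. */
    for (struct node *n = head; n; ) {
        struct node *next = (struct node *)(uintptr_t)n->next_cmd_addr;
        printf("node at gpa offset %llu\n", (unsigned long long)n->addr);
        free(n);
        n = next;
    }
    return 0;
}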
+ */ + if (s->send_packet_hdr_len < 1) { + s->send_packet_hdr_len = sev_send_get_packet_len(&fw_error); + if (s->send_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + } + + packet_hdr = g_new(guchar, s->send_packet_hdr_len); + memset(packet_hdr, 0, s->send_packet_hdr_len); + + update = g_new0(struct kvm_sev_send_update_data, 1); + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update->hdr_uaddr = (unsigned long)packet_hdr; + update->hdr_len = s->send_packet_hdr_len; + update->guest_uaddr = (unsigned long)ptr; + update->guest_len = size; + update->trans_uaddr = (unsigned long)trans; + update->trans_len = size; + + new_node = csv_batch_cmd_list_node_create((uint64_t)update, addr); + if (!new_node) { + ret = -ENOMEM; + goto err; + } + + if (s->csv_batch_cmd_list == NULL) { + s->csv_batch_cmd_list = csv_batch_cmd_list_create(new_node, + send_update_data_free); + if (s->csv_batch_cmd_list == NULL) { + ret = -ENOMEM; + goto err; + } + } else { + /* Add new_node's command address to the last_node */ + csv_batch_cmd_list_add_after(s->csv_batch_cmd_list, new_node); + } + + trace_kvm_sev_send_update_data(ptr, trans, size); + + return ret; + +err: + g_free(trans); + g_free(update); + g_free(packet_hdr); + g_free(new_node); + if (s->csv_batch_cmd_list) { + csv_batch_cmd_list_destroy(s->csv_batch_cmd_list); + s->csv_batch_cmd_list = NULL; + } + return ret; +} + +static int +csv_receive_queue_data(SevGuestState *s, QEMUFile *f, uint8_t *ptr) +{ + int ret = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_data *update; + struct kvm_csv_batch_list_node *new_node = NULL; + + update = g_new0(struct kvm_sev_receive_update_data, 1); + /* get packet header */ + update->hdr_len = qemu_get_be32(f); + hdr = g_new(gchar, update->hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update->hdr_len); + update->hdr_uaddr = (unsigned long)hdr; + + /* get transport buffer */ + update->trans_len = qemu_get_be32(f); + trans = g_new(gchar, update->trans_len); + update->trans_uaddr = (unsigned long)trans; + qemu_get_buffer(f, (uint8_t *)update->trans_uaddr, update->trans_len); + + /* set guest address,guest len is page_size */ + update->guest_uaddr = (uint64_t)ptr; + update->guest_len = TARGET_PAGE_SIZE; + + new_node = csv_batch_cmd_list_node_create((uint64_t)update, 0); + if (!new_node) { + ret = -ENOMEM; + goto err; + } + + if (s->csv_batch_cmd_list == NULL) { + s->csv_batch_cmd_list = csv_batch_cmd_list_create(new_node, + receive_update_data_free); + if (s->csv_batch_cmd_list == NULL) { + ret = -ENOMEM; + goto err; + } + } else { + /* Add new_node's command address to the last_node */ + csv_batch_cmd_list_add_after(s->csv_batch_cmd_list, new_node); + } + + trace_kvm_sev_receive_update_data(trans, (void *)ptr, update->guest_len, + (void *)hdr, update->hdr_len); + + return ret; + +err: + g_free(trans); + g_free(update); + g_free(hdr); + g_free(new_node); + if (s->csv_batch_cmd_list) { + csv_batch_cmd_list_destroy(s->csv_batch_cmd_list); + s->csv_batch_cmd_list = NULL; + } + return ret; +} + +static int +csv_command_batch(uint32_t cmd_id, uint64_t head_uaddr, int *fw_err) +{ + int ret; + struct kvm_csv_command_batch command_batch = { }; + + command_batch.command_id = cmd_id; + command_batch.csv_batch_list_uaddr = head_uaddr; + + ret = sev_ioctl(sev_guest->sev_fd, KVM_CSV_COMMAND_BATCH, + &command_batch, fw_err); + if (ret) { + error_report("%s: COMMAND_BATCH ret=%d fw_err=%d '%s'", + __func__, 
ret, *fw_err, fw_error_to_str(*fw_err));
+    }
+
+    return ret;
+}
+
+static int
+csv_send_update_data_batch(SevGuestState *s, QEMUFile *f, uint64_t *bytes_sent)
+{
+    int ret, fw_error = 0;
+    struct kvm_sev_send_update_data *update;
+    struct kvm_csv_batch_list_node *node;
+
+    ret = csv_command_batch(KVM_SEV_SEND_UPDATE_DATA,
+                            (uint64_t)s->csv_batch_cmd_list->head, &fw_error);
+    if (ret) {
+        error_report("%s: csv_command_batch ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+        goto err;
+    }
+
+    for (node = s->csv_batch_cmd_list->head;
+         node != NULL;
+         node = (struct kvm_csv_batch_list_node *)node->next_cmd_addr) {
+        if (node != s->csv_batch_cmd_list->head) {
+            /* head's page header is saved before send_update_data */
+            qemu_put_be64(f, node->addr);
+            *bytes_sent += 8;
+            if (node->next_cmd_addr != 0) {
+                qemu_put_be32(f, RAM_SAVE_ENCRYPTED_PAGE_BATCH);
+            } else {
+                qemu_put_be32(f, RAM_SAVE_ENCRYPTED_PAGE_BATCH_END);
+            }
+            *bytes_sent += 4;
+        }
+        update = (struct kvm_sev_send_update_data *)node->cmd_data_addr;
+        qemu_put_be32(f, update->hdr_len);
+        qemu_put_buffer(f, (uint8_t *)update->hdr_uaddr, update->hdr_len);
+        *bytes_sent += (4 + update->hdr_len);
+
+        qemu_put_be32(f, update->trans_len);
+        qemu_put_buffer(f, (uint8_t *)update->trans_uaddr, update->trans_len);
+        *bytes_sent += (4 + update->trans_len);
+    }
+
+err:
+    csv_batch_cmd_list_destroy(s->csv_batch_cmd_list);
+    s->csv_batch_cmd_list = NULL;
+    return ret;
+}
+
+static int
+csv_receive_update_data_batch(SevGuestState *s)
+{
+    int ret;
+    int fw_error;
+
+    ret = csv_command_batch(KVM_SEV_RECEIVE_UPDATE_DATA,
+                            (uint64_t)s->csv_batch_cmd_list->head, &fw_error);
+    if (ret) {
+        error_report("%s: csv_command_batch ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+    }
+
+    csv_batch_cmd_list_destroy(s->csv_batch_cmd_list);
+    s->csv_batch_cmd_list = NULL;
+    return ret;
+}
+
+int
+csv_queue_outgoing_page(uint8_t *ptr, uint32_t sz, uint64_t addr)
+{
+    SevGuestState *s = sev_guest;
+
+    /* Only supported for HYGON CSV */
+    if (!is_hygon_cpu()) {
+        error_report("Only support enqueue pages for HYGON CSV");
+        return -EINVAL;
+    }
+
+    return csv_send_queue_data(s, ptr, sz, addr);
+}
+
+int csv_queue_incoming_page(QEMUFile *f, uint8_t *ptr)
+{
+    SevGuestState *s = sev_guest;
+
+    /* Only supported for HYGON CSV */
+    if (!is_hygon_cpu()) {
+        error_report("Only support enqueue received pages for HYGON CSV");
+        return -EINVAL;
+    }
+
+    /*
+     * If this is the first buffer and SEV is not in the receiving state,
+     * use the RECEIVE_START command to create an encryption context.
+     */
+    if (!sev_check_state(s, SEV_STATE_RECEIVE_UPDATE) &&
+        sev_receive_start(s, f)) {
+        return 1;
+    }
+
+    return csv_receive_queue_data(s, f, ptr);
+}
+
+int
+csv_save_queued_outgoing_pages(QEMUFile *f, uint64_t *bytes_sent)
+{
+    SevGuestState *s = sev_guest;
+
+    /* Only supported for HYGON CSV */
+    if (!is_hygon_cpu()) {
+        error_report("Only support transfer queued pages for HYGON CSV");
+        return -EINVAL;
+    }
+
+    /*
+     * If this is the first buffer, create the outgoing encryption context
+     * and write our PDH, policy and session data.
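+     * sev_send_start() also moves the guest into SEV_STATE_SEND_UPDATE,
+     * so the state check above creates the context only once per
+     * outgoing migration stream.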
+ */ + if (!sev_check_state(s, SEV_STATE_SEND_UPDATE) && + sev_send_start(s, f, bytes_sent)) { + error_report("Failed to create outgoing context"); + return 1; + } + + return csv_send_update_data_batch(s, f, bytes_sent); +} + +int csv_load_queued_incoming_pages(QEMUFile *f) +{ + SevGuestState *s = sev_guest; + + /* Only support for HYGON CSV */ + if (!is_hygon_cpu()) { + error_report("Only support load queued pages for HYGON CSV"); + return -EINVAL; + } + + return csv_receive_update_data_batch(s); +} + +static int +sev_send_vmsa_get_packet_len(int *fw_err) +{ + int ret; + struct kvm_sev_send_update_vmsa update = { 0, }; + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_SEND_UPDATE_VMSA, + &update, fw_err); + if (*fw_err != SEV_RET_INVALID_LEN) { + ret = 0; + error_report("%s: failed to get session length ret=%d fw_error=%d '%s'", + __func__, ret, *fw_err, fw_error_to_str(*fw_err)); + goto err; + } + + ret = update.hdr_len; + +err: + return ret; +} + +static int +sev_send_update_vmsa(SevGuestState *s, QEMUFile *f, uint32_t cpu_id, + uint32_t cpu_index, uint32_t size, uint64_t *bytes_sent) +{ + int ret, fw_error; + guchar *trans = NULL; + struct kvm_sev_send_update_vmsa update = {}; + + /* + * If this is first call then query the packet header bytes and allocate + * the packet buffer. + */ + if (!s->send_vmsa_packet_hdr) { + s->send_vmsa_packet_hdr_len = sev_send_vmsa_get_packet_len(&fw_error); + if (s->send_vmsa_packet_hdr_len < 1) { + error_report("%s: SEND_UPDATE_VMSA fw_error=%d '%s'", + __func__, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + s->send_vmsa_packet_hdr = g_new(gchar, s->send_vmsa_packet_hdr_len); + } + + /* allocate transport buffer */ + trans = g_new(guchar, size); + + update.vcpu_id = cpu_id; + update.hdr_uaddr = (uintptr_t)s->send_vmsa_packet_hdr; + update.hdr_len = s->send_vmsa_packet_hdr_len; + update.trans_uaddr = (uintptr_t)trans; + update.trans_len = size; + + trace_kvm_sev_send_update_vmsa(cpu_id, cpu_index, trans, size); + + ret = sev_ioctl(s->sev_fd, KVM_SEV_SEND_UPDATE_VMSA, &update, &fw_error); + if (ret) { + error_report("%s: SEND_UPDATE_VMSA ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); + goto err; + } + + /* + * Migration of vCPU's VMState according to the instance_id + * (i.e. 
CPUState.cpu_index) + */ + qemu_put_be32(f, sizeof(uint32_t)); + qemu_put_buffer(f, (uint8_t *)&cpu_index, sizeof(uint32_t)); + *bytes_sent += 4 + sizeof(uint32_t); + + qemu_put_be32(f, update.hdr_len); + qemu_put_buffer(f, (uint8_t *)update.hdr_uaddr, update.hdr_len); + *bytes_sent += 4 + update.hdr_len; + + qemu_put_be32(f, update.trans_len); + qemu_put_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + *bytes_sent += 4 + update.trans_len; + +err: + g_free(trans); + return ret; +} + +int csv_save_outgoing_cpu_state(QEMUFile *f, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + CPUState *cpu; + int ret = 0; + + /* Only support migrate VMSAs for HYGON CSV2 guest */ + if (!sev_es_enabled() || !is_hygon_cpu()) { + return 0; + } + + CPU_FOREACH(cpu) { + qemu_put_be32(f, ENCRYPTED_CPU_STATE_CONT); + *bytes_sent += 4; + ret = sev_send_update_vmsa(s, f, kvm_arch_vcpu_id(cpu), + cpu->cpu_index, TARGET_PAGE_SIZE, bytes_sent); + if (ret) { + goto err; + } + } + + qemu_put_be32(f, ENCRYPTED_CPU_STATE_END); + *bytes_sent += 4; + +err: + return ret; +} + +static int sev_receive_update_vmsa(QEMUFile *f) +{ + int ret = 1, fw_error = 0; + CPUState *cpu; + uint32_t cpu_index, cpu_id = 0; + gchar *hdr = NULL, *trans = NULL; + struct kvm_sev_receive_update_vmsa update = {}; + + /* get cpu index buffer */ + assert(qemu_get_be32(f) == sizeof(uint32_t)); + qemu_get_buffer(f, (uint8_t *)&cpu_index, sizeof(uint32_t)); + + CPU_FOREACH(cpu) { + if (cpu->cpu_index == cpu_index) { + cpu_id = kvm_arch_vcpu_id(cpu); + break; + } + } + update.vcpu_id = cpu_id; + + /* get packet header */ + update.hdr_len = qemu_get_be32(f); + if (!check_blob_length(update.hdr_len)) { + return 1; + } + + hdr = g_new(gchar, update.hdr_len); + qemu_get_buffer(f, (uint8_t *)hdr, update.hdr_len); + update.hdr_uaddr = (uintptr_t)hdr; + + /* get transport buffer */ + update.trans_len = qemu_get_be32(f); + if (!check_blob_length(update.trans_len)) { + goto err; + } + + trans = g_new(gchar, update.trans_len); + update.trans_uaddr = (uintptr_t)trans; + qemu_get_buffer(f, (uint8_t *)update.trans_uaddr, update.trans_len); + + trace_kvm_sev_receive_update_vmsa(cpu_id, cpu_index, + trans, update.trans_len, hdr, update.hdr_len); + + ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_RECEIVE_UPDATE_VMSA, + &update, &fw_error); + if (ret) { + error_report("Error RECEIVE_UPDATE_VMSA ret=%d fw_error=%d '%s'", + ret, fw_error, fw_error_to_str(fw_error)); + } + +err: + g_free(trans); + g_free(hdr); + return ret; +} + +int csv_load_incoming_cpu_state(QEMUFile *f) +{ + int status, ret = 0; + + /* Only support migrate VMSAs for HYGON CSV2 guest */ + if (!sev_es_enabled() || !is_hygon_cpu()) { + return 0; + } + + status = qemu_get_be32(f); + while (status == ENCRYPTED_CPU_STATE_CONT) { + ret = sev_receive_update_vmsa(f); + if (ret) { + break; + } + + status = qemu_get_be32(f); + } + + return ret; +} + +static int +csv_load_launch_secret(const char *secret_header_file, const char *secret_file) +{ + gsize secret_header_size, secret_size; + gchar *secret_header = NULL, *secret = NULL; + uint8_t *data; + struct sev_secret_area *area; + uint64_t gpa; + GError *error = NULL; + Error *local_err = NULL; + int ret = 0; + + if (!g_file_get_contents(secret_header_file, + &secret_header, + &secret_header_size, &error)) { + error_report("CSV: Failed to read '%s' (%s)", + secret_header_file, error->message); + g_error_free(error); + return -1; + } + + if (!g_file_get_contents(secret_file, &secret, &secret_size, &error)) { + error_report("CSV: Failed to read '%s' 
(%s)", secret_file, error->message); + g_error_free(error); + return -1; + } + + if (!pc_system_ovmf_table_find(SEV_SECRET_GUID, &data, NULL)) { + error_report("CSV: no secret area found in OVMF, gpa must be" + " specified."); + return -1; + } + area = (struct sev_secret_area *)data; + gpa = area->base; + + ret = sev_inject_launch_secret((char *)secret_header, + (char *)secret, gpa, &local_err); + + if (local_err) { + error_report_err(local_err); + } + return ret; +} + +static const QemuUUID sev_hash_table_header_guid = { + .data = UUID_LE(0x9438d606, 0x4f22, 0x4cc9, 0xb4, 0x79, 0xa7, 0x93, + 0xd4, 0x11, 0xfd, 0x21) +}; + +static const QemuUUID sev_kernel_entry_guid = { + .data = UUID_LE(0x4de79437, 0xabd2, 0x427f, 0xb8, 0x35, 0xd5, 0xb1, + 0x72, 0xd2, 0x04, 0x5b) +}; +static const QemuUUID sev_initrd_entry_guid = { + .data = UUID_LE(0x44baf731, 0x3a2f, 0x4bd7, 0x9a, 0xf1, 0x41, 0xe2, + 0x91, 0x69, 0x78, 0x1d) +}; +static const QemuUUID sev_cmdline_entry_guid = { + .data = UUID_LE(0x97d02dd8, 0xbd20, 0x4c94, 0xaa, 0x78, 0xe7, 0x71, + 0x4d, 0x36, 0xab, 0x2a) +}; + +/* + * Add the hashes of the linux kernel/initrd/cmdline to an encrypted guest page + * which is included in SEV's initial memory measurement. + */ +bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) +{ + uint8_t *data; + SevHashTableDescriptor *area; + SevHashTable *ht; + PaddedSevHashTable *padded_ht; + uint8_t cmdline_hash[HASH_SIZE]; + uint8_t initrd_hash[HASH_SIZE]; + uint8_t kernel_hash[HASH_SIZE]; + uint8_t *hashp; + size_t hash_len = HASH_SIZE; + hwaddr mapped_len = sizeof(*padded_ht); + MemTxAttrs attrs = { 0 }; + bool ret = true; + + /* + * Only add the kernel hashes if the sev-guest configuration explicitly + * stated kernel-hashes=on. + */ + if (!sev_guest->kernel_hashes) { + return false; + } + + if (!pc_system_ovmf_table_find(SEV_HASH_TABLE_RV_GUID, &data, NULL)) { + error_setg(errp, "SEV: kernel specified but guest firmware " + "has no hashes table GUID"); + return false; + } + area = (SevHashTableDescriptor *)data; + if (!area->base || area->size < sizeof(PaddedSevHashTable)) { + error_setg(errp, "SEV: guest firmware hashes table area is invalid " + "(base=0x%x size=0x%x)", area->base, area->size); + return false; + } + + /* + * Calculate hash of kernel command-line with the terminating null byte. If + * the user doesn't supply a command-line via -append, the 1-byte "\0" will + * be used. + */ + hashp = cmdline_hash; + if (qcrypto_hash_bytes(QCRYPTO_HASH_ALG_SHA256, ctx->cmdline_data, + ctx->cmdline_size, &hashp, &hash_len, errp) < 0) { + return false; + } + assert(hash_len == HASH_SIZE); /* * Calculate hash of initrd. 
If the user doesn't supply an initrd via @@ -1373,7 +2758,17 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) /* zero the excess data so the measurement can be reliably calculated */ memset(padded_ht->padding, 0, sizeof(padded_ht->padding)); - if (sev_encrypt_flash((uint8_t *)padded_ht, sizeof(*padded_ht), errp) < 0) { + if (csv3_enabled()) { + if (kvm_hygon_coco_ext_inuse & KVM_CAP_HYGON_COCO_EXT_CSV3_MULT_LUP_DATA) { + if (csv3_load_data(area->base, (uint8_t *)padded_ht, + sizeof(*padded_ht), errp) < 0) { + ret = false; + } + } else { + error_report("%s: CSV3 load kernel hashes unsupported!", __func__); + ret = false; + } + } else if (sev_encrypt_flash((uint8_t *)padded_ht, sizeof(*padded_ht), errp) < 0) { ret = false; } @@ -1383,6 +2778,27 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) return ret; } +static int _sev_send_start(QEMUFile *f, uint64_t *bytes_sent) +{ + SevGuestState *s = sev_guest; + + return sev_send_start(s, f, bytes_sent); +} + +static int _sev_receive_start(QEMUFile *f) +{ + SevGuestState *s = sev_guest; + + return sev_receive_start(s, f); +} + +struct sev_ops sev_ops = { + .sev_ioctl = sev_ioctl, + .fw_error_to_str = fw_error_to_str, + .sev_send_start = _sev_send_start, + .sev_receive_start = _sev_receive_start, +}; + static void sev_register_types(void) { diff --git a/target/i386/sev.h b/target/i386/sev.h index e7499c95b1e87cd6ecdff2c28009b65807b52946..647b426b16f4b4c4d003a8a59ac07f42f966f112 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -38,6 +38,13 @@ typedef struct SevKernelLoaderContext { size_t cmdline_size; } SevKernelLoaderContext; +#define RAM_SAVE_ENCRYPTED_PAGE 0x1 +#define RAM_SAVE_SHARED_REGIONS_LIST 0x2 + +#define RAM_SAVE_ENCRYPTED_PAGE_BATCH 0x4 +#define RAM_SAVE_ENCRYPTED_PAGE_BATCH_END 0x5 +#define RAM_SAVE_ENCRYPTED_CPU_STATE 0x6 + #ifdef CONFIG_SEV bool sev_enabled(void); bool sev_es_enabled(void); @@ -51,12 +58,35 @@ uint32_t sev_get_reduced_phys_bits(void); bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp); int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp); +int sev_save_setup(const char *pdh, const char *plat_cert, + const char *amd_cert); +int sev_save_outgoing_page(QEMUFile *f, uint8_t *ptr, + uint32_t size, uint64_t *bytes_sent); +int sev_load_incoming_page(QEMUFile *f, uint8_t *ptr); int sev_inject_launch_secret(const char *hdr, const char *secret, uint64_t gpa, Error **errp); int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size); void sev_es_set_reset_vector(CPUState *cpu); +int sev_remove_shared_regions_list(unsigned long gfn_start, + unsigned long gfn_end); +int sev_add_shared_regions_list(unsigned long gfn_start, unsigned long gfn_end); +int sev_save_outgoing_shared_regions_list(QEMUFile *f, uint64_t *bytes_sent); +int sev_load_incoming_shared_regions_list(QEMUFile *f); +bool sev_is_gfn_in_unshared_region(unsigned long gfn); +void sev_del_migrate_blocker(void); int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +extern bool sev_kvm_has_msr_ghcb; + +struct sev_ops { + int (*sev_ioctl)(int fd, int cmd, void *data, int *error); + const char *(*fw_error_to_str)(int code); + int (*sev_send_start)(QEMUFile *f, uint64_t *bytes_sent); + int (*sev_receive_start)(QEMUFile *f); +}; + +extern struct sev_ops sev_ops; + #endif diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c index 5b86f439add2a920cd3ee0054e3bf329339854b5..294dbc50e2b0bcf73f1ddddea65f0ebb51db21b3 100644 --- 
a/target/i386/tcg/sysemu/excp_helper.c +++ b/target/i386/tcg/sysemu/excp_helper.c @@ -107,6 +107,10 @@ static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new) { uint32_t cmp; + CPUState *cpu = env_cpu(in->env); + /* We are in cpu_exec, and start_exclusive can't be called directly.*/ + g_assert(cpu->running); + cpu_exec_end(cpu); /* Does x86 really perform a rmw cycle on mmio for ptw? */ start_exclusive(); cmp = cpu_ldl_mmuidx_ra(in->env, in->gaddr, in->ptw_idx, 0); @@ -114,6 +118,7 @@ static bool ptw_setl_slow(const PTETranslate *in, uint32_t old, uint32_t new) cpu_stl_mmuidx_ra(in->env, in->gaddr, new, in->ptw_idx, 0); } end_exclusive(); + cpu_exec_start(cpu); return cmp == old; } diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 037bc47e7c27ca01e955aa92e9c3e660cd9a9a77..19b8250452c30fbde3c755675dfed39cf7558897 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2661,7 +2661,7 @@ static void gen_enter(DisasContext *s, int esp_addend, int level) } /* Copy the FrameTemp value to EBP. */ - gen_op_mov_reg_v(s, a_ot, R_EBP, s->T1); + gen_op_mov_reg_v(s, d_ot, R_EBP, s->T1); /* Compute the final value of ESP. */ tcg_gen_subi_tl(s->T1, s->T1, esp_addend + size * level); @@ -2790,7 +2790,7 @@ do_gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf, bool jr) if (recheck_tf) { gen_helper_rechecking_single_step(tcg_env); tcg_gen_exit_tb(NULL, 0); - } else if (s->flags & HF_TF_MASK) { + } else if ((s->flags & HF_TF_MASK) && !inhibit) { gen_helper_single_step(tcg_env); } else if (jr) { tcg_gen_lookup_and_goto_ptr(); diff --git a/target/i386/trace-events b/target/i386/trace-events index 2cd8726eebb7d42b70a14ba3c128290b428847da..5d4a709a39dedc3f4b0a6ad27f4e51fe25fcdac9 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -11,3 +11,19 @@ kvm_sev_launch_measurement(const char *value) "data %s" kvm_sev_launch_finish(void) "" kvm_sev_launch_secret(uint64_t hpa, uint64_t hva, uint64_t secret, int len) "hpa 0x%" PRIx64 " hva 0x%" PRIx64 " data 0x%" PRIx64 " len %d" kvm_sev_attestation_report(const char *mnonce, const char *data) "mnonce %s data %s" +kvm_sev_send_start(uint64_t pdh, int l1, uint64_t plat, int l2, uint64_t amd, int l3) "pdh 0x%" PRIx64 " len %d plat 0x%" PRIx64 " len %d amd 0x%" PRIx64 " len %d" +kvm_sev_send_update_data(void *src, void *dst, int len) "guest %p trans %p len %d" +kvm_sev_send_finish(void) "" +kvm_sev_receive_start(int policy, void *session, void *pdh) "policy 0x%x session %p pdh %p" +kvm_sev_receive_update_data(void *src, void *dst, int len, void *hdr, int hdr_len) "guest %p trans %p len %d hdr %p hdr_len %d" +kvm_sev_receive_finish(void) "" +kvm_sev_send_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *dst, int len) "cpu_id %d cpu_index %d trans %p len %d" +kvm_sev_receive_update_vmsa(uint32_t cpu_id, uint32_t cpu_index, void *src, int len, void *hdr, int hdr_len) "cpu_id %d cpu_index %d trans %p len %d hdr %p hdr_len %d" + +# csv.c +kvm_csv3_launch_encrypt_data(uint64_t gpa, void *addr, uint64_t len) "gpa 0x%" PRIx64 " addr %p len 0x%" PRIx64 +kvm_csv3_send_encrypt_data(void *dst, int len) "trans %p len %d" +kvm_csv3_send_encrypt_context(void *dst, int len) "trans %p len %d" +kvm_csv3_receive_encrypt_data(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" +kvm_csv3_receive_encrypt_context(void *dst, int len, void *hdr, int hdr_len) "trans %p len %d hdr %p hdr_len %d" +kvm_csv3_set_guest_private_memory(void) "" diff --git 
a/target/loongarch/arch_dump.c b/target/loongarch/arch_dump.c
new file mode 100644
index 0000000000000000000000000000000000000000..d9e1120333ce1a345e0153118cd6f18f54e7308b
--- /dev/null
+++ b/target/loongarch/arch_dump.c
@@ -0,0 +1,163 @@
+/*
+ * Support for writing ELF notes for LoongArch architectures
+ *
+ * Copyright (c) 2023 Loongarch Technology
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "elf.h"
+#include "sysemu/dump.h"
+#include "internals.h"
+
+/* struct user_pt_regs from arch/loongarch/include/uapi/asm/ptrace.h */
+struct loongarch_user_regs {
+    uint64_t gpr[32];
+    uint64_t pad1[1];
+    /* Special CSR registers. */
+    uint64_t csr_era;
+    uint64_t csr_badv;
+    uint64_t pad2[10];
+} QEMU_PACKED;
+
+QEMU_BUILD_BUG_ON(sizeof(struct loongarch_user_regs) != 360);
+
+/* struct elf_prstatus from include/uapi/linux/elfcore.h */
+struct loongarch_elf_prstatus {
+    char pad1[32]; /* 32 == offsetof(struct elf_prstatus, pr_pid) */
+    uint32_t pr_pid;
+    /*
+     * 76 == offsetof(struct elf_prstatus, pr_reg) -
+     * offsetof(struct elf_prstatus, pr_ppid)
+     */
+    char pad2[76];
+    struct loongarch_user_regs pr_reg;
+    uint32_t pr_fpvalid;
+    char pad3[4];
+} QEMU_PACKED;
+
+QEMU_BUILD_BUG_ON(sizeof(struct loongarch_elf_prstatus) != 480);
+
+/* struct user_fp_state from arch/loongarch/include/uapi/asm/ptrace.h */
+struct loongarch_fpu_struct {
+    uint64_t fpr[32];
+    uint64_t fcc;
+    unsigned int fcsr;
+} QEMU_PACKED;
+
+QEMU_BUILD_BUG_ON(sizeof(struct loongarch_fpu_struct) != 268);
+
+struct loongarch_note {
+    Elf64_Nhdr hdr;
+    char name[8]; /* align_up(sizeof("CORE"), 4) */
+    union {
+        struct loongarch_elf_prstatus prstatus;
+        struct loongarch_fpu_struct fpu;
+    };
+} QEMU_PACKED;
+
+#define LOONGARCH_NOTE_HEADER_SIZE offsetof(struct loongarch_note, prstatus)
+#define LOONGARCH_PRSTATUS_NOTE_SIZE \
+    (LOONGARCH_NOTE_HEADER_SIZE + sizeof(struct loongarch_elf_prstatus))
+#define LOONGARCH_PRFPREG_NOTE_SIZE \
+    (LOONGARCH_NOTE_HEADER_SIZE + sizeof(struct loongarch_fpu_struct))
+
+static void loongarch_note_init(struct loongarch_note *note, DumpState *s,
+                                const char *name, Elf64_Word namesz,
+                                Elf64_Word type, Elf64_Word descsz)
+{
+    memset(note, 0, sizeof(*note));
+
+    note->hdr.n_namesz = cpu_to_dump32(s, namesz);
+    note->hdr.n_descsz = cpu_to_dump32(s, descsz);
+    note->hdr.n_type = cpu_to_dump32(s, type);
+
+    memcpy(note->name, name, namesz);
+}
+
+static int loongarch_write_elf64_fprpreg(WriteCoreDumpFunction f,
+                                         CPULoongArchState *env, int cpuid,
+                                         DumpState *s)
+{
+    struct loongarch_note note;
+    int ret, i;
+
+    loongarch_note_init(&note, s, "CORE", 5, NT_PRFPREG, sizeof(note.fpu));
+    note.fpu.fcsr = cpu_to_dump64(s, env->fcsr0);
+    note.fpu.fcc = cpu_to_dump64(s, read_fcc(env));
+
+    for (i = 0; i < 32; ++i) {
+        note.fpu.fpr[i] = cpu_to_dump64(s, env->fpr[i].vreg.UD[0]);
+    }
+
+    ret = f(&note, LOONGARCH_PRFPREG_NOTE_SIZE, s);
+    if (ret < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+int
loongarch_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
+                                   int cpuid, DumpState *s)
+{
+    struct loongarch_note note;
+    CPULoongArchState *env = &LOONGARCH_CPU(cs)->env;
+    int ret, i;
+
+    loongarch_note_init(&note, s, "CORE", 5, NT_PRSTATUS,
+                        sizeof(note.prstatus));
+    note.prstatus.pr_pid = cpu_to_dump32(s, cpuid);
+    note.prstatus.pr_fpvalid = cpu_to_dump32(s, 1);
+
+    for (i = 0; i < 32; ++i) {
+        note.prstatus.pr_reg.gpr[i] = cpu_to_dump64(s, env->gpr[i]);
+    }
+    note.prstatus.pr_reg.csr_era = cpu_to_dump64(s, env->CSR_ERA);
+    note.prstatus.pr_reg.csr_badv = cpu_to_dump64(s, env->CSR_BADV);
+    ret = f(&note, LOONGARCH_PRSTATUS_NOTE_SIZE, s);
+    if (ret < 0) {
+        return -1;
+    }
+
+    ret = loongarch_write_elf64_fprpreg(f, env, cpuid, s);
+    if (ret < 0) {
+        return -1;
+    }
+
+    return ret;
+}
+
+int cpu_get_dump_info(ArchDumpInfo *info,
+                      const GuestPhysBlockList *guest_phys_blocks)
+{
+    info->d_machine = EM_LOONGARCH;
+    info->d_endian = ELFDATA2LSB;
+    info->d_class = ELFCLASS64;
+
+    return 0;
+}
+
+ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
+{
+    size_t note_size = 0;
+
+    if (class == ELFCLASS64) {
+        note_size = LOONGARCH_PRSTATUS_NOTE_SIZE + LOONGARCH_PRFPREG_NOTE_SIZE;
+    }
+
+    return note_size * nr_cpus;
+}
diff --git a/target/loongarch/cpu-csr.h b/target/loongarch/cpu-csr.h
index c59d7a9fcbc4c570d7b0c410e7c05768cc39d24f..0834e91f30e3bfcfd894f8e7e1e49e930f7025bb 100644
--- a/target/loongarch/cpu-csr.h
+++ b/target/loongarch/cpu-csr.h
@@ -67,6 +67,9 @@ FIELD(TLBENTRY, D, 1, 1)
 FIELD(TLBENTRY, PLV, 2, 2)
 FIELD(TLBENTRY, MAT, 4, 2)
 FIELD(TLBENTRY, G, 6, 1)
+FIELD(TLBENTRY, HUGE, 6, 1)
+FIELD(TLBENTRY, HGLOBAL, 12, 1)
+FIELD(TLBENTRY, LEVEL, 13, 2)
 FIELD(TLBENTRY_32, PPN, 8, 24)
 FIELD(TLBENTRY_64, PPN, 12, 36)
 FIELD(TLBENTRY_64, NR, 61, 1)
diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
index fc075952e6356060fe46b8530ea1267765d7a867..561566f3a04971760acea2c48c6f1ec325b706e6 100644
--- a/target/loongarch/cpu.c
+++ b/target/loongarch/cpu.c
@@ -8,18 +8,29 @@
 #include "qemu/osdep.h"
 #include "qemu/log.h"
 #include "qemu/qemu-print.h"
+#include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "qemu/module.h"
 #include "sysemu/qtest.h"
-#include "exec/cpu_ldst.h"
+#include "sysemu/tcg.h"
+#include "sysemu/kvm.h"
+#include "kvm/kvm_loongarch.h"
 #include "exec/exec-all.h"
 #include "cpu.h"
+#include "hw/qdev-properties.h"
 #include "internals.h"
 #include "fpu/softfloat-helpers.h"
 #include "cpu-csr.h"
+#include "qapi/visitor.h"
 #include "sysemu/reset.h"
-#include "tcg/tcg.h"
 #include "vec.h"
+#ifdef CONFIG_KVM
+#include <linux/kvm.h>
+#endif
+#ifdef CONFIG_TCG
+#include "exec/cpu_ldst.h"
+#include "tcg/tcg.h"
+#endif
 
 const char * const regnames[32] = {
     "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
@@ -35,33 +46,45 @@ const char * const fregnames[32] = {
     "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
 };
 
-static const char * const excp_names[] = {
-    [EXCCODE_INT] = "Interrupt",
-    [EXCCODE_PIL] = "Page invalid exception for load",
-    [EXCCODE_PIS] = "Page invalid exception for store",
-    [EXCCODE_PIF] = "Page invalid exception for fetch",
-    [EXCCODE_PME] = "Page modified exception",
-    [EXCCODE_PNR] = "Page Not Readable exception",
-    [EXCCODE_PNX] = "Page Not Executable exception",
-    [EXCCODE_PPI] = "Page Privilege error",
-    [EXCCODE_ADEF] = "Address error for instruction fetch",
-    [EXCCODE_ADEM] = "Address error for Memory access",
-    [EXCCODE_SYS] = "Syscall",
-    [EXCCODE_BRK] = "Break",
-    [EXCCODE_INE] = "Instruction Non-Existent",
-    [EXCCODE_IPE] = "Instruction privilege
error", - [EXCCODE_FPD] = "Floating Point Disabled", - [EXCCODE_FPE] = "Floating Point Exception", - [EXCCODE_DBP] = "Debug breakpoint", - [EXCCODE_BCE] = "Bound Check Exception", - [EXCCODE_SXD] = "128 bit vector instructions Disable exception", - [EXCCODE_ASXD] = "256 bit vector instructions Disable exception", +struct TypeExcp { + int32_t exccode; + const char * const name; +}; + +static const struct TypeExcp excp_names[] = { + {EXCCODE_INT, "Interrupt"}, + {EXCCODE_PIL, "Page invalid exception for load"}, + {EXCCODE_PIS, "Page invalid exception for store"}, + {EXCCODE_PIF, "Page invalid exception for fetch"}, + {EXCCODE_PME, "Page modified exception"}, + {EXCCODE_PNR, "Page Not Readable exception"}, + {EXCCODE_PNX, "Page Not Executable exception"}, + {EXCCODE_PPI, "Page Privilege error"}, + {EXCCODE_ADEF, "Address error for instruction fetch"}, + {EXCCODE_ADEM, "Address error for Memory access"}, + {EXCCODE_SYS, "Syscall"}, + {EXCCODE_BRK, "Break"}, + {EXCCODE_INE, "Instruction Non-Existent"}, + {EXCCODE_IPE, "Instruction privilege error"}, + {EXCCODE_FPD, "Floating Point Disabled"}, + {EXCCODE_FPE, "Floating Point Exception"}, + {EXCCODE_DBP, "Debug breakpoint"}, + {EXCCODE_BCE, "Bound Check Exception"}, + {EXCCODE_SXD, "128 bit vector instructions Disable exception"}, + {EXCCODE_ASXD, "256 bit vector instructions Disable exception"}, + {EXCP_HLT, "EXCP_HLT"}, }; const char *loongarch_exception_name(int32_t exception) { - assert(excp_names[exception]); - return excp_names[exception]; + int i; + + for (i = 0; i < ARRAY_SIZE(excp_names); i++) { + if (excp_names[i].exccode == exception) { + return excp_names[i].name; + } + } + return "Unknown"; } void G_NORETURN do_raise_exception(CPULoongArchState *env, @@ -70,7 +93,7 @@ void G_NORETURN do_raise_exception(CPULoongArchState *env, { CPUState *cs = env_cpu(env); - qemu_log_mask(CPU_LOG_INT, "%s: %d (%s)\n", + qemu_log_mask(CPU_LOG_INT, "%s: exception: %d (%s)\n", __func__, exception, loongarch_exception_name(exception)); @@ -108,12 +131,15 @@ void loongarch_cpu_set_irq(void *opaque, int irq, int level) return; } - env->CSR_ESTAT = deposit64(env->CSR_ESTAT, irq, 1, level != 0); - - if (FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS)) { - cpu_interrupt(cs, CPU_INTERRUPT_HARD); - } else { - cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); + if (kvm_enabled()) { + kvm_loongarch_set_interrupt(cpu, irq, level); + } else if (tcg_enabled()) { + env->CSR_ESTAT = deposit64(env->CSR_ESTAT, irq, 1, level != 0); + if (FIELD_EX64(env->CSR_ESTAT, CSR_ESTAT, IS)) { + cpu_interrupt(cs, CPU_INTERRUPT_HARD); + } else { + cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); + } } } @@ -138,29 +164,26 @@ static inline bool cpu_loongarch_hw_interrupts_pending(CPULoongArchState *env) return (pending & status) != 0; } +#endif +#ifdef CONFIG_TCG +#ifndef CONFIG_USER_ONLY static void loongarch_cpu_do_interrupt(CPUState *cs) { LoongArchCPU *cpu = LOONGARCH_CPU(cs); CPULoongArchState *env = &cpu->env; bool update_badinstr = 1; int cause = -1; - const char *name; bool tlbfill = FIELD_EX64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR); uint32_t vec_size = FIELD_EX64(env->CSR_ECFG, CSR_ECFG, VS); if (cs->exception_index != EXCCODE_INT) { - if (cs->exception_index < 0 || - cs->exception_index >= ARRAY_SIZE(excp_names)) { - name = "unknown"; - } else { - name = excp_names[cs->exception_index]; - } - qemu_log_mask(CPU_LOG_INT, "%s enter: pc " TARGET_FMT_lx " ERA " TARGET_FMT_lx - " TLBRERA " TARGET_FMT_lx " %s exception\n", __func__, - env->pc, env->CSR_ERA, env->CSR_TLBRERA, name); + " TLBRERA " 
TARGET_FMT_lx " exception: %d (%s)\n", + __func__, env->pc, env->CSR_ERA, env->CSR_TLBRERA, + cs->exception_index, + loongarch_exception_name(cs->exception_index)); } switch (cs->exception_index) { @@ -320,7 +343,6 @@ static bool loongarch_cpu_exec_interrupt(CPUState *cs, int interrupt_request) } #endif -#ifdef CONFIG_TCG static void loongarch_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb) { @@ -407,6 +429,14 @@ static void loongarch_la464_initfn(Object *obj) data = FIELD_DP32(data, CPUCFG5, CC_DIV, 1); env->cpucfg[5] = data; + if (kvm_enabled()) { + data = 0; + data = FIELD_DP32(data, CPUCFG6, PMP, 1); + data = FIELD_DP32(data, CPUCFG6, PMNUM, 3); + data = FIELD_DP32(data, CPUCFG6, PMBITS, 63); + env->cpucfg[6] = data; + } + data = 0; data = FIELD_DP32(data, CPUCFG16, L1_IUPRE, 1); data = FIELD_DP32(data, CPUCFG16, L1_DPRE, 1); @@ -443,6 +473,18 @@ static void loongarch_la464_initfn(Object *obj) env->cpucfg[20] = data; env->CSR_ASID = FIELD_DP64(0, CSR_ASID, ASIDBITS, 0xa); + + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, SAVE_NUM, 8); + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, TIMER_BITS, 0x2f); + env->CSR_PRCFG1 = FIELD_DP64(env->CSR_PRCFG1, CSR_PRCFG1, VSMAX, 7); + + env->CSR_PRCFG2 = 0x3ffff000; + + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, TLB_TYPE, 2); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, MTLB_ENTRY, 63); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_WAYS, 7); + env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_SETS, 8); + loongarch_cpu_post_init(obj); } @@ -507,17 +549,19 @@ static void loongarch_cpu_reset_hold(Object *obj) lacc->parent_phases.hold(obj); } +#ifdef CONFIG_TCG env->fcsr0_mask = FCSR0_M1 | FCSR0_M2 | FCSR0_M3; +#endif env->fcsr0 = 0x0; int n; - /* Set csr registers value after reset */ + /* Set csr registers value after reset, see the manual 6.4. 
+     */
     env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PLV, 0);
     env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, IE, 0);
     env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DA, 1);
     env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, PG, 0);
-    env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATF, 1);
-    env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATM, 1);
+    env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATF, 0);
+    env->CSR_CRMD = FIELD_DP64(env->CSR_CRMD, CSR_CRMD, DATM, 0);
 
     env->CSR_EUEN = FIELD_DP64(env->CSR_EUEN, CSR_EUEN, FPE, 0);
     env->CSR_EUEN = FIELD_DP64(env->CSR_EUEN, CSR_EUEN, SXE, 0);
@@ -529,17 +573,28 @@
     env->CSR_ECFG = FIELD_DP64(env->CSR_ECFG, CSR_ECFG, VS, 0);
     env->CSR_ECFG = FIELD_DP64(env->CSR_ECFG, CSR_ECFG, LIE, 0);
 
-    env->CSR_ESTAT = env->CSR_ESTAT & (~MAKE_64BIT_MASK(0, 2));
+    env->CSR_ESTAT = 0;
 
     env->CSR_RVACFG = FIELD_DP64(env->CSR_RVACFG, CSR_RVACFG, RBITS, 0);
+    env->CSR_CPUID = cs->cpu_index;
     env->CSR_TCFG = FIELD_DP64(env->CSR_TCFG, CSR_TCFG, EN, 0);
     env->CSR_LLBCTL = FIELD_DP64(env->CSR_LLBCTL, CSR_LLBCTL, KLO, 0);
     env->CSR_TLBRERA = FIELD_DP64(env->CSR_TLBRERA, CSR_TLBRERA, ISTLBR, 0);
     env->CSR_MERRCTL = FIELD_DP64(env->CSR_MERRCTL, CSR_MERRCTL, ISMERR, 0);
-
-    env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, TLB_TYPE, 2);
-    env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, MTLB_ENTRY, 63);
-    env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_WAYS, 7);
-    env->CSR_PRCFG3 = FIELD_DP64(env->CSR_PRCFG3, CSR_PRCFG3, STLB_SETS, 8);
+    env->CSR_TID = cs->cpu_index;
+    /*
+     * Workaround for edk2-stable202408: the CSR PGD register is set only
+     * if its current value is zero for the boot CPU, which causes a
+     * reboot issue.
+     *
+     * So clear the TLB-related CSR registers here.
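+     * That way a warm reboot starts from the same state as a cold boot
+     * and the firmware initializes the PGD (and the rest) again.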
+ */ + env->CSR_PGDH = 0; + env->CSR_PGDL = 0; + env->CSR_PWCL = 0; + env->CSR_PWCH = 0; + env->CSR_STLBPS = 0; + env->CSR_EENTRY = 0; + env->CSR_TLBRENTRY = 0; + env->CSR_MERRENTRY = 0; for (n = 0; n < 4; n++) { env->CSR_DMW[n] = FIELD_DP64(env->CSR_DMW[n], CSR_DMW, PLV0, 0); @@ -550,10 +605,17 @@ static void loongarch_cpu_reset_hold(Object *obj) #ifndef CONFIG_USER_ONLY env->pc = 0x1c000000; +#ifdef CONFIG_TCG memset(env->tlb, 0, sizeof(env->tlb)); #endif + if (kvm_enabled()) { + kvm_arch_reset_vcpu(cs); + } +#endif +#ifdef CONFIG_TCG restore_fp_status(env); +#endif cs->exception_index = -1; } @@ -576,53 +638,23 @@ static void loongarch_cpu_realizefn(DeviceState *dev, Error **errp) loongarch_cpu_register_gdb_regs_for_features(cs); - cpu_reset(cs); qemu_init_vcpu(cs); + cpu_reset(cs); lacc->parent_realize(dev, errp); } -#ifndef CONFIG_USER_ONLY -static void loongarch_qemu_write(void *opaque, hwaddr addr, - uint64_t val, unsigned size) -{ - qemu_log_mask(LOG_UNIMP, "[%s]: Unimplemented reg 0x%" HWADDR_PRIx "\n", - __func__, addr); -} - -static uint64_t loongarch_qemu_read(void *opaque, hwaddr addr, unsigned size) -{ - switch (addr) { - case VERSION_REG: - return 0x11ULL; - case FEATURE_REG: - return 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI | - 1ULL << IOCSRF_CSRIPI; - case VENDOR_REG: - return 0x6e6f73676e6f6f4cULL; /* "Loongson" */ - case CPUNAME_REG: - return 0x303030354133ULL; /* "3A5000" */ - case MISC_FUNC_REG: - return 1ULL << IOCSRM_EXTIOI_EN; - } - return 0ULL; -} +static void loongarch_cpu_unrealizefn(DeviceState *dev) +{ + LoongArchCPUClass *mcc = LOONGARCH_CPU_GET_CLASS(dev); -static const MemoryRegionOps loongarch_qemu_ops = { - .read = loongarch_qemu_read, - .write = loongarch_qemu_write, - .endianness = DEVICE_LITTLE_ENDIAN, - .valid = { - .min_access_size = 4, - .max_access_size = 8, - }, - .impl = { - .min_access_size = 8, - .max_access_size = 8, - }, -}; +#ifndef CONFIG_USER_ONLY + cpu_remove_sync(CPU(dev)); #endif + mcc->parent_unrealize(dev); +} + static bool loongarch_get_lsx(Object *obj, Error **errp) { LoongArchCPU *cpu = LOONGARCH_CPU(obj); @@ -675,17 +707,54 @@ static void loongarch_set_lasx(Object *obj, bool value, Error **errp) } } +static bool loongarch_get_lbt(Object *obj, Error **errp) +{ + return LOONGARCH_CPU(obj)->lbt != ON_OFF_AUTO_OFF; +} + +static void loongarch_set_lbt(Object *obj, bool value, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + + cpu->lbt = value ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; +} + +static bool loongarch_get_pmu(Object *obj, Error **errp) +{ + return LOONGARCH_CPU(obj)->pmu != ON_OFF_AUTO_OFF; +} + +static void loongarch_set_pmu(Object *obj, bool value, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(obj); + + cpu->pmu = value ? 
ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
+}
+
 void loongarch_cpu_post_init(Object *obj)
 {
     LoongArchCPU *cpu = LOONGARCH_CPU(obj);
 
-    if (FIELD_EX32(cpu->env.cpucfg[2], CPUCFG2, LSX)) {
-        object_property_add_bool(obj, "lsx", loongarch_get_lsx,
-                                 loongarch_set_lsx);
-    }
-    if (FIELD_EX32(cpu->env.cpucfg[2], CPUCFG2, LASX)) {
-        object_property_add_bool(obj, "lasx", loongarch_get_lasx,
-                                 loongarch_set_lasx);
+    object_property_add_bool(obj, "lsx", loongarch_get_lsx,
+                             loongarch_set_lsx);
+    object_property_add_bool(obj, "lasx", loongarch_get_lasx,
+                             loongarch_set_lasx);
+
+    if (kvm_enabled()) {
+        cpu->lbt = ON_OFF_AUTO_AUTO;
+        object_property_add_bool(obj, "lbt", loongarch_get_lbt,
+                                 loongarch_set_lbt);
+        object_property_set_description(obj, "lbt",
+                                        "Set off to disable Binary Translation.");
+
+        cpu->pmu = ON_OFF_AUTO_AUTO;
+        object_property_add_bool(obj, "pmu", loongarch_get_pmu,
+                                 loongarch_set_pmu);
+        object_property_set_description(obj, "pmu",
+                                        "Set off to disable performance monitor unit.");
+
+    } else {
+        cpu->lbt = ON_OFF_AUTO_OFF;
     }
 }
 
@@ -693,17 +762,12 @@
 static void loongarch_cpu_init(Object *obj)
 {
 #ifndef CONFIG_USER_ONLY
     LoongArchCPU *cpu = LOONGARCH_CPU(obj);
-    CPULoongArchState *env = &cpu->env;
 
     qdev_init_gpio_in(DEVICE(cpu), loongarch_cpu_set_irq, N_IRQS);
+#ifdef CONFIG_TCG
     timer_init_ns(&cpu->timer, QEMU_CLOCK_VIRTUAL,
                   &loongarch_constant_timer_cb, cpu);
-    memory_region_init_io(&env->system_iocsr, OBJECT(cpu), NULL,
-                          env, "iocsr", UINT64_MAX);
-    address_space_init(&env->address_space_iocsr, &env->system_iocsr, "IOCSR");
-    memory_region_init_io(&env->iocsr_mem, OBJECT(cpu), &loongarch_qemu_ops,
-                          NULL, "iocsr_misc", 0x428);
-    memory_region_add_subregion(&env->system_iocsr, 0, &env->iocsr_mem);
+#endif
 #endif
 }
 
@@ -734,8 +798,7 @@ void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags)
     int i;
 
     qemu_fprintf(f, " PC=%016" PRIx64 " ", env->pc);
-    qemu_fprintf(f, " FCSR0 0x%08x fp_status 0x%02x\n", env->fcsr0,
-                 get_float_exception_flags(&env->fp_status));
+    qemu_fprintf(f, " FCSR0 0x%08x\n", env->fcsr0);
 
     /* gpr */
     for (i = 0; i < 32; i++) {
@@ -758,10 +821,12 @@
     qemu_fprintf(f, "EENTRY=%016" PRIx64 "\n", env->CSR_EENTRY);
     qemu_fprintf(f, "PRCFG1=%016" PRIx64 ", PRCFG2=%016" PRIx64 ","
                  " PRCFG3=%016" PRIx64 "\n",
-                 env->CSR_PRCFG1, env->CSR_PRCFG3, env->CSR_PRCFG3);
+                 env->CSR_PRCFG1, env->CSR_PRCFG2, env->CSR_PRCFG3);
     qemu_fprintf(f, "TLBRENTRY=%016" PRIx64 "\n", env->CSR_TLBRENTRY);
     qemu_fprintf(f, "TLBRBADV=%016" PRIx64 "\n", env->CSR_TLBRBADV);
     qemu_fprintf(f, "TLBRERA=%016" PRIx64 "\n", env->CSR_TLBRERA);
+    qemu_fprintf(f, "TCFG=%016" PRIx64 "\n", env->CSR_TCFG);
+    qemu_fprintf(f, "TVAL=%016" PRIx64 "\n", env->CSR_TVAL);
 
     /* fpr */
     if (flags & CPU_DUMP_FPU) {
@@ -795,6 +860,7 @@ static struct TCGCPUOps loongarch_tcg_ops = {
 #include "hw/core/sysemu-cpu-ops.h"
 
 static const struct SysemuCPUOps loongarch_sysemu_ops = {
+    .write_elf64_note = loongarch_cpu_write_elf64_note,
     .get_phys_page_debug = loongarch_cpu_get_phys_page_debug,
 };
 
@@ -806,6 +872,15 @@ static int64_t loongarch_cpu_get_arch_id(CPUState *cs)
 {
 }
 #endif
 
+static Property loongarch_cpu_properties[] = {
+    DEFINE_PROP_INT32("socket-id", LoongArchCPU, socket_id, 0),
+    DEFINE_PROP_INT32("core-id", LoongArchCPU, core_id, 0),
+    DEFINE_PROP_INT32("thread-id", LoongArchCPU, thread_id, 0),
+    DEFINE_PROP_INT32("node-id", LoongArchCPU, node_id, CPU_UNSET_NUMA_NODE_ID),
+
+    DEFINE_PROP_END_OF_LIST()
+};
+
 static void loongarch_cpu_class_init(ObjectClass *c, void *data)
 {
LoongArchCPUClass *lacc = LOONGARCH_CPU_CLASS(c); @@ -813,8 +888,11 @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data) DeviceClass *dc = DEVICE_CLASS(c); ResettableClass *rc = RESETTABLE_CLASS(c); + device_class_set_props(dc, loongarch_cpu_properties); device_class_set_parent_realize(dc, loongarch_cpu_realizefn, &lacc->parent_realize); + device_class_set_parent_unrealize(dc, loongarch_cpu_unrealizefn, + &lacc->parent_unrealize); resettable_class_set_parent_phases(rc, NULL, loongarch_cpu_reset_hold, NULL, &lacc->parent_phases); @@ -836,6 +914,7 @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data) #ifdef CONFIG_TCG cc->tcg_ops = &loongarch_tcg_ops; #endif + dc->user_creatable = true; } static const gchar *loongarch32_gdb_arch_name(CPUState *cs) diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h index 00d1fba597f8eb791d99bc937115515e58eba610..6cc717c5eaffb1dec9bec4917acb66c55bb4524b 100644 --- a/target/loongarch/cpu.h +++ b/target/loongarch/cpu.h @@ -18,6 +18,7 @@ #endif #include "cpu-csr.h" #include "cpu-qom.h" +#include "qapi/qapi-types-common.h" #define IOCSRF_TEMP 0 #define IOCSRF_NODECNT 1 @@ -36,6 +37,7 @@ #define CPUNAME_REG 0x20 #define MISC_FUNC_REG 0x420 #define IOCSRM_EXTIOI_EN 48 +#define IOCSRM_EXTIOI_INT_ENCODE 49 #define IOCSR_MEM_SIZE 0x428 @@ -154,6 +156,7 @@ FIELD(CPUCFG2, LLFTP_VER, 15, 3) FIELD(CPUCFG2, LBT_X86, 18, 1) FIELD(CPUCFG2, LBT_ARM, 19, 1) FIELD(CPUCFG2, LBT_MIPS, 20, 1) +FIELD(CPUCFG2, LBT_ALL, 18, 3) FIELD(CPUCFG2, LSPW, 21, 1) FIELD(CPUCFG2, LAM, 22, 1) @@ -185,6 +188,8 @@ FIELD(CPUCFG6, PMNUM, 4, 4) FIELD(CPUCFG6, PMBITS, 8, 6) FIELD(CPUCFG6, UPM, 14, 1) +#define PMNUM_MAX 16 + /* cpucfg[16] bits */ FIELD(CPUCFG16, L1_IUPRE, 0, 1) FIELD(CPUCFG16, L1_IUUNIFY, 1, 1) @@ -272,6 +277,7 @@ union fpr_t { VReg vreg; }; +#ifdef CONFIG_TCG struct LoongArchTLB { uint64_t tlb_misc; /* Fields corresponding to CSR_TLBELO0/1 */ @@ -279,23 +285,35 @@ struct LoongArchTLB { uint64_t tlb_entry1; }; typedef struct LoongArchTLB LoongArchTLB; +#endif + +enum loongarch_features { + LOONGARCH_FEATURE_LBT, /* loongson binary translation extension */ + LOONGARCH_FEATURE_PMU, +}; + +typedef struct LoongArchBT { + /* scratch registers */ + uint64_t scr0; + uint64_t scr1; + uint64_t scr2; + uint64_t scr3; + /* loongarch eflags */ + uint32_t eflags; + uint32_t ftop; +} lbt_t; typedef struct CPUArchState { uint64_t gpr[32]; uint64_t pc; fpr_t fpr[32]; - float_status fp_status; bool cf[8]; - uint32_t fcsr0; - uint32_t fcsr0_mask; + lbt_t lbt; uint32_t cpucfg[21]; - uint64_t lladdr; /* LL virtual address compared against SC */ - uint64_t llval; - /* LoongArch CSRs */ uint64_t CSR_CRMD; uint64_t CSR_PRMD; @@ -319,6 +337,7 @@ typedef struct CPUArchState { uint64_t CSR_PWCH; uint64_t CSR_STLBPS; uint64_t CSR_RVACFG; + uint64_t CSR_CPUID; uint64_t CSR_PRCFG1; uint64_t CSR_PRCFG2; uint64_t CSR_PRCFG3; @@ -350,21 +369,41 @@ typedef struct CPUArchState { uint64_t CSR_DBG; uint64_t CSR_DERA; uint64_t CSR_DSAVE; - uint64_t CSR_CPUID; + struct { + uint64_t guest_addr; + } stealtime; +#ifdef CONFIG_TCG + float_status fp_status; + uint32_t fcsr0_mask; + uint64_t lladdr; /* LL virtual address compared against SC */ + uint64_t llval; +#endif #ifndef CONFIG_USER_ONLY +#ifdef CONFIG_TCG LoongArchTLB tlb[LOONGARCH_TLB_MAX]; +#endif - AddressSpace address_space_iocsr; - MemoryRegion system_iocsr; - MemoryRegion iocsr_mem; + AddressSpace *address_space_iocsr; bool load_elf; uint64_t elf_address; + uint32_t mp_state; /* Store ipistate to access from this struct */ 
DeviceState *ipistate; + + struct loongarch_boot_info *boot_info; #endif + struct { + uint64_t guest_addr; + } st; } CPULoongArchState; +typedef struct LoongArchCPUTopo { + int32_t socket_id; /* socket-id of this VCPU */ + int32_t core_id; /* core-id of this VCPU */ + int32_t thread_id; /* thread-id of this VCPU */ +} LoongArchCPUTopo; + /** * LoongArchCPU: * @env: #CPULoongArchState @@ -377,9 +416,18 @@ struct ArchCPU { CPULoongArchState env; QEMUTimer timer; uint32_t phy_id; + OnOffAuto lbt; + OnOffAuto pmu; + int32_t socket_id; /* socket-id of this VCPU */ + int32_t core_id; /* core-id of this VCPU */ + int32_t thread_id; /* thread-id of this VCPU */ + int32_t node_id; /* NUMA node this CPU belongs to */ /* 'compatible' string for this CPU for Linux device trees */ const char *dtb_compatible; + /* used by KVM_REG_LOONGARCH_COUNTER ioctl to access guest time counters */ + uint64_t kvm_state_counter; + VMChangeStateEntry *vmsentry; }; /** @@ -393,6 +441,7 @@ struct LoongArchCPUClass { CPUClass parent_class; DeviceRealize parent_realize; + DeviceUnrealize parent_unrealize; ResettablePhases parent_phases; }; diff --git a/target/loongarch/cpu_helper.c b/target/loongarch/cpu_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..39037eecb4bd566b838e808ad2aa3875f1eebd02 --- /dev/null +++ b/target/loongarch/cpu_helper.c @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * LoongArch CPU helpers for qemu + * + * Copyright (c) 2024 Loongson Technology Corporation Limited + * + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "cpu-csr.h" + +#ifdef CONFIG_TCG +static int loongarch_map_tlb_entry(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + int access_type, int index, int mmu_idx) +{ + LoongArchTLB *tlb = &env->tlb[index]; + uint64_t plv = mmu_idx; + uint64_t tlb_entry, tlb_ppn; + uint8_t tlb_ps, n, tlb_v, tlb_d, tlb_plv, tlb_nx, tlb_nr, tlb_rplv; + + if (index >= LOONGARCH_STLB) { + tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); + } else { + tlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); + } + n = (address >> tlb_ps) & 0x1;/* Odd or even */ + + tlb_entry = n ? 
tlb->tlb_entry1 : tlb->tlb_entry0; + tlb_v = FIELD_EX64(tlb_entry, TLBENTRY, V); + tlb_d = FIELD_EX64(tlb_entry, TLBENTRY, D); + tlb_plv = FIELD_EX64(tlb_entry, TLBENTRY, PLV); + if (is_la64(env)) { + tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_64, PPN); + tlb_nx = FIELD_EX64(tlb_entry, TLBENTRY_64, NX); + tlb_nr = FIELD_EX64(tlb_entry, TLBENTRY_64, NR); + tlb_rplv = FIELD_EX64(tlb_entry, TLBENTRY_64, RPLV); + } else { + tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_32, PPN); + tlb_nx = 0; + tlb_nr = 0; + tlb_rplv = 0; + } + + /* Remove sw bit between bit12 -- bit PS*/ + tlb_ppn = tlb_ppn & ~(((0x1UL << (tlb_ps - 12)) -1)); + + /* Check access rights */ + if (!tlb_v) { + return TLBRET_INVALID; + } + + if (access_type == MMU_INST_FETCH && tlb_nx) { + return TLBRET_XI; + } + + if (access_type == MMU_DATA_LOAD && tlb_nr) { + return TLBRET_RI; + } + + if (((tlb_rplv == 0) && (plv > tlb_plv)) || + ((tlb_rplv == 1) && (plv != tlb_plv))) { + return TLBRET_PE; + } + + if ((access_type == MMU_DATA_STORE) && !tlb_d) { + return TLBRET_DIRTY; + } + + *physical = (tlb_ppn << R_TLBENTRY_64_PPN_SHIFT) | + (address & MAKE_64BIT_MASK(0, tlb_ps)); + *prot = PAGE_READ; + if (tlb_d) { + *prot |= PAGE_WRITE; + } + if (!tlb_nx) { + *prot |= PAGE_EXEC; + } + return TLBRET_MATCH; +} + +/* + * One tlb entry holds an adjacent odd/even pair, the vpn is the + * content of the virtual page number divided by 2. So the + * compare vpn is bit[47:15] for 16KiB page. while the vppn + * field in tlb entry contains bit[47:13], so need adjust. + * virt_vpn = vaddr[47:13] + */ +bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, + int *index) +{ + LoongArchTLB *tlb; + uint16_t csr_asid, tlb_asid, stlb_idx; + uint8_t tlb_e, tlb_ps, tlb_g, stlb_ps; + int i, compare_shift; + uint64_t vpn, tlb_vppn; + + csr_asid = FIELD_EX64(env->CSR_ASID, CSR_ASID, ASID); + stlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); + vpn = (vaddr & TARGET_VIRT_MASK) >> (stlb_ps + 1); + stlb_idx = vpn & 0xff; /* VA[25:15] <==> TLBIDX.index for 16KiB Page */ + compare_shift = stlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; + + /* Search STLB */ + for (i = 0; i < 8; ++i) { + tlb = &env->tlb[i * 256 + stlb_idx]; + tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); + if (tlb_e) { + tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); + tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); + tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); + + if ((tlb_g == 1 || tlb_asid == csr_asid) && + (vpn == (tlb_vppn >> compare_shift))) { + *index = i * 256 + stlb_idx; + return true; + } + } + } + + /* Search MTLB */ + for (i = LOONGARCH_STLB; i < LOONGARCH_TLB_MAX; ++i) { + tlb = &env->tlb[i]; + tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); + if (tlb_e) { + tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); + tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); + tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); + tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); + compare_shift = tlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; + vpn = (vaddr & TARGET_VIRT_MASK) >> (tlb_ps + 1); + if ((tlb_g == 1 || tlb_asid == csr_asid) && + (vpn == (tlb_vppn >> compare_shift))) { + *index = i; + return true; + } + } + } + return false; +} + +static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + int index, match; + + match = loongarch_tlb_search(env, address, &index); + if (match) { + return loongarch_map_tlb_entry(env, physical, prot, + address, access_type, index, mmu_idx); + } + + 
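+    /* No TLB entry matched: the caller turns this into a TLB refill. */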
return TLBRET_NOMATCH; +} +#else +static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + return TLBRET_NOMATCH; +} +#endif + +static hwaddr dmw_va2pa(CPULoongArchState *env, target_ulong va, + target_ulong dmw) +{ + if (is_la64(env)) { + return va & TARGET_VIRT_MASK; + } else { + uint32_t pseg = FIELD_EX32(dmw, CSR_DMW_32, PSEG); + return (va & MAKE_64BIT_MASK(0, R_CSR_DMW_32_VSEG_SHIFT)) | \ + (pseg << R_CSR_DMW_32_VSEG_SHIFT); + } +} + +int get_physical_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx) +{ + int user_mode = mmu_idx == MMU_IDX_USER; + int kernel_mode = mmu_idx == MMU_IDX_KERNEL; + uint32_t plv, base_c, base_v; + int64_t addr_high; + uint8_t da = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, DA); + uint8_t pg = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PG); + + /* Check PG and DA */ + if (da & !pg) { + *physical = address & TARGET_PHYS_MASK; + *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + return TLBRET_MATCH; + } + + plv = kernel_mode | (user_mode << R_CSR_DMW_PLV3_SHIFT); + if (is_la64(env)) { + base_v = address >> R_CSR_DMW_64_VSEG_SHIFT; + } else { + base_v = address >> R_CSR_DMW_32_VSEG_SHIFT; + } + /* Check direct map window */ + for (int i = 0; i < 4; i++) { + if (is_la64(env)) { + base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_64, VSEG); + } else { + base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_32, VSEG); + } + if ((plv & env->CSR_DMW[i]) && (base_c == base_v)) { + *physical = dmw_va2pa(env, address, env->CSR_DMW[i]); + *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + return TLBRET_MATCH; + } + } + + /* Check valid extension */ + addr_high = sextract64(address, TARGET_VIRT_ADDR_SPACE_BITS, 16); + if (!(addr_high == 0 || addr_high == -1)) { + return TLBRET_BADADDR; + } + + /* Mapped address */ + return loongarch_map_address(env, physical, prot, address, + access_type, mmu_idx); +} + +hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + hwaddr phys_addr; + int prot; + + if (get_physical_address(env, &phys_addr, &prot, addr, MMU_DATA_LOAD, + cpu_mmu_index(env, false)) != 0) { + return -1; + } + return phys_addr; +} diff --git a/target/loongarch/gdbstub.c b/target/loongarch/gdbstub.c index 5fc2f19e965e101a269c923a101c9be05d2bd2ac..cc72680c38b282bdcc2889537fa902483510089b 100644 --- a/target/loongarch/gdbstub.c +++ b/target/loongarch/gdbstub.c @@ -33,28 +33,29 @@ void write_fcc(CPULoongArchState *env, uint64_t val) int loongarch_cpu_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n) { - LoongArchCPU *cpu = LOONGARCH_CPU(cs); - CPULoongArchState *env = &cpu->env; - uint64_t val; - - if (0 <= n && n < 32) { - val = env->gpr[n]; - } else if (n == 32) { - /* orig_a0 */ - val = 0; - } else if (n == 33) { - val = env->pc; - } else if (n == 34) { - val = env->CSR_BADV; - } + CPULoongArchState *env = cpu_env(cs); if (0 <= n && n <= 34) { + uint64_t val; + + if (n < 32) { + val = env->gpr[n]; + } else if (n == 32) { + /* orig_a0 */ + val = 0; + } else if (n == 33) { + val = env->pc; + } else /* if (n == 34) */ { + val = env->CSR_BADV; + } + if (is_la64(env)) { return gdb_get_reg64(mem_buf, val); } else { return gdb_get_reg32(mem_buf, val); } } + return 0; } @@ -67,10 +68,10 @@ int loongarch_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) int length = 0; if (is_la64(env)) { - tmp = ldq_p(mem_buf); + tmp = 
ldq_le_p(mem_buf); read_length = 8; } else { - tmp = ldl_p(mem_buf); + tmp = ldl_le_p(mem_buf); read_length = 4; } @@ -103,13 +104,13 @@ static int loongarch_gdb_set_fpu(CPULoongArchState *env, int length = 0; if (0 <= n && n < 32) { - env->fpr[n].vreg.D(0) = ldq_p(mem_buf); + env->fpr[n].vreg.D(0) = ldq_le_p(mem_buf); length = 8; } else if (32 <= n && n < 40) { env->cf[n - 32] = ldub_p(mem_buf); length = 1; } else if (n == 40) { - env->fcsr0 = ldl_p(mem_buf); + env->fcsr0 = ldl_le_p(mem_buf); length = 4; } return length; diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h index c492863cc5defa89d0392de53355e2cc4d0f6922..1a02427627d8e968a1b829bdb7915a212860856e 100644 --- a/target/loongarch/internals.h +++ b/target/loongarch/internals.h @@ -16,11 +16,6 @@ #define TARGET_PHYS_MASK MAKE_64BIT_MASK(0, TARGET_PHYS_ADDR_SPACE_BITS) #define TARGET_VIRT_MASK MAKE_64BIT_MASK(0, TARGET_VIRT_ADDR_SPACE_BITS) -/* Global bit used for lddir/ldpte */ -#define LOONGARCH_PAGE_HUGE_SHIFT 6 -/* Global bit for huge page */ -#define LOONGARCH_HGLOBAL_SHIFT 12 - void loongarch_translate_init(void); void loongarch_cpu_dump_state(CPUState *cpu, FILE *f, int flags); @@ -31,10 +26,23 @@ void G_NORETURN do_raise_exception(CPULoongArchState *env, const char *loongarch_exception_name(int32_t exception); +#ifdef CONFIG_TCG int ieee_ex_to_loongarch(int xcpt); void restore_fp_status(CPULoongArchState *env); +#endif #ifndef CONFIG_USER_ONLY +enum { + TLBRET_MATCH = 0, + TLBRET_BADADDR = 1, + TLBRET_NOMATCH = 2, + TLBRET_INVALID = 3, + TLBRET_DIRTY = 4, + TLBRET_RI = 5, + TLBRET_XI = 6, + TLBRET_PE = 7, +}; + extern const VMStateDescription vmstate_loongarch_cpu; void loongarch_cpu_set_irq(void *opaque, int irq, int level); @@ -44,12 +52,18 @@ uint64_t cpu_loongarch_get_constant_timer_counter(LoongArchCPU *cpu); uint64_t cpu_loongarch_get_constant_timer_ticks(LoongArchCPU *cpu); void cpu_loongarch_store_constant_timer_config(LoongArchCPU *cpu, uint64_t value); +bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, + int *index); +int get_physical_address(CPULoongArchState *env, hwaddr *physical, + int *prot, target_ulong address, + MMUAccessType access_type, int mmu_idx); +hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); +#ifdef CONFIG_TCG bool loongarch_cpu_tlb_fill(CPUState *cs, vaddr address, int size, MMUAccessType access_type, int mmu_idx, bool probe, uintptr_t retaddr); - -hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); +#endif #endif /* !CONFIG_USER_ONLY */ uint64_t read_fcc(CPULoongArchState *env); @@ -58,5 +72,7 @@ void write_fcc(CPULoongArchState *env, uint64_t val); int loongarch_cpu_gdb_read_register(CPUState *cs, GByteArray *mem_buf, int n); int loongarch_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n); void loongarch_cpu_register_gdb_regs_for_features(CPUState *cs); +int loongarch_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cpu, + int cpuid, DumpState *s); #endif diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c new file mode 100644 index 0000000000000000000000000000000000000000..f724e77a1bb1d1615453ab0a7b24f9096a4272c2 --- /dev/null +++ b/target/loongarch/kvm/kvm.c @@ -0,0 +1,1121 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * QEMU LoongArch KVM + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "qemu/osdep.h" +#include +#include + +#include "qapi/error.h" +#include "qemu/timer.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" 
+#include "sysemu/sysemu.h" +#include "sysemu/kvm.h" +#include "sysemu/kvm_int.h" +#include "hw/pci/pci.h" +#include "exec/memattrs.h" +#include "exec/address-spaces.h" +#include "hw/boards.h" +#include "hw/irq.h" +#include "qemu/log.h" +#include "hw/loader.h" +#include "migration/migration.h" +#include "sysemu/runstate.h" +#include "cpu-csr.h" +#include "kvm_loongarch.h" +#include "trace.h" + +static bool cap_has_mp_state; +static unsigned int brk_insn; +const KVMCapabilityInfo kvm_arch_required_capabilities[] = { + KVM_CAP_LAST_INFO +}; + +static int kvm_get_stealtime(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + int err; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&env->stealtime.guest_addr, + }; + + err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr); + if (err) { + return 0; + } + + err = kvm_vcpu_ioctl(cs, KVM_GET_DEVICE_ATTR, attr); + if (err) { + error_report("PVTIME: KVM_GET_DEVICE_ATTR: %s", strerror(errno)); + return err; + } + + return 0; +} + +static int kvm_set_stealtime(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + int err; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_PVTIME_CTRL, + .attr = KVM_LOONGARCH_VCPU_PVTIME_GPA, + .addr = (uint64_t)&env->stealtime.guest_addr, + }; + + err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr); + if (err) { + return 0; + } + + err = kvm_vcpu_ioctl(cs, KVM_SET_DEVICE_ATTR, attr); + if (err) { + error_report("PVTIME: KVM_SET_DEVICE_ATTR %s with gpa "TARGET_FMT_lx, + strerror(errno), env->stealtime.guest_addr); + return err; + } + + return 0; +} + +static int kvm_loongarch_get_regs_core(CPUState *cs) +{ + int ret = 0; + int i; + struct kvm_regs regs; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + /* Get the current register set as KVM seems it */ + ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, ®s); + if (ret < 0) { + trace_kvm_failed_get_regs_core(strerror(errno)); + return ret; + } + /* gpr[0] value is always 0 */ + env->gpr[0] = 0; + for (i = 1; i < 32; i++) { + env->gpr[i] = regs.gpr[i]; + } + + env->pc = regs.pc; + return ret; +} + +static int kvm_loongarch_put_regs_core(CPUState *cs) +{ + int ret = 0; + int i; + struct kvm_regs regs; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + /* Set the registers based on QEMU's view of things */ + for (i = 0; i < 32; i++) { + regs.gpr[i] = env->gpr[i]; + } + + regs.pc = env->pc; + ret = kvm_vcpu_ioctl(cs, KVM_SET_REGS, ®s); + if (ret < 0) { + trace_kvm_failed_put_regs_core(strerror(errno)); + } + + return ret; +} + +static int kvm_loongarch_get_csr(CPUState *cs) +{ + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CRMD), + &env->CSR_CRMD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRMD), + &env->CSR_PRMD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EUEN), + &env->CSR_EUEN); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_MISC), + &env->CSR_MISC); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ECFG), + &env->CSR_ECFG); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ESTAT), + &env->CSR_ESTAT); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ERA), + &env->CSR_ERA); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADV), + &env->CSR_BADV); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADI), + 
&env->CSR_BADI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EENTRY), + &env->CSR_EENTRY); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBIDX), + &env->CSR_TLBIDX); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBEHI), + &env->CSR_TLBEHI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO0), + &env->CSR_TLBELO0); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO1), + &env->CSR_TLBELO1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ASID), + &env->CSR_ASID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDL), + &env->CSR_PGDL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDH), + &env->CSR_PGDH); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGD), + &env->CSR_PGD); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCL), + &env->CSR_PWCL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCH), + &env->CSR_PWCH); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_STLBPS), + &env->CSR_STLBPS); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_RVACFG), + &env->CSR_RVACFG); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CPUID), + &env->CSR_CPUID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG1), + &env->CSR_PRCFG1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG2), + &env->CSR_PRCFG2); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG3), + &env->CSR_PRCFG3); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(0)), + &env->CSR_SAVE[0]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(1)), + &env->CSR_SAVE[1]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(2)), + &env->CSR_SAVE[2]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(3)), + &env->CSR_SAVE[3]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(4)), + &env->CSR_SAVE[4]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(5)), + &env->CSR_SAVE[5]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(6)), + &env->CSR_SAVE[6]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(7)), + &env->CSR_SAVE[7]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TID), + &env->CSR_TID); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CNTC), + &env->CSR_CNTC); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TICLR), + &env->CSR_TICLR); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_LLBCTL), + &env->CSR_LLBCTL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL1), + &env->CSR_IMPCTL1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL2), + &env->CSR_IMPCTL2); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRENTRY), + &env->CSR_TLBRENTRY); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRBADV), + &env->CSR_TLBRBADV); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRERA), + &env->CSR_TLBRERA); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRSAVE), + &env->CSR_TLBRSAVE); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO0), + &env->CSR_TLBRELO0); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO1), + &env->CSR_TLBRELO1); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBREHI), + &env->CSR_TLBREHI); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRPRMD), + &env->CSR_TLBRPRMD); + + ret |= kvm_get_one_reg(cs, 
KVM_IOC_CSRID(LOONGARCH_CSR_DMW(0)), + &env->CSR_DMW[0]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(1)), + &env->CSR_DMW[1]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(2)), + &env->CSR_DMW[2]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(3)), + &env->CSR_DMW[3]); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TVAL), + &env->CSR_TVAL); + + ret |= kvm_get_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TCFG), + &env->CSR_TCFG); + + return ret; +} + +static int kvm_loongarch_put_csr(CPUState *cs, int level) +{ + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CRMD), + &env->CSR_CRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRMD), + &env->CSR_PRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EUEN), + &env->CSR_EUEN); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_MISC), + &env->CSR_MISC); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ECFG), + &env->CSR_ECFG); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ESTAT), + &env->CSR_ESTAT); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ERA), + &env->CSR_ERA); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADV), + &env->CSR_BADV); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_BADI), + &env->CSR_BADI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_EENTRY), + &env->CSR_EENTRY); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBIDX), + &env->CSR_TLBIDX); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBEHI), + &env->CSR_TLBEHI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO0), + &env->CSR_TLBELO0); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBELO1), + &env->CSR_TLBELO1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_ASID), + &env->CSR_ASID); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDL), + &env->CSR_PGDL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGDH), + &env->CSR_PGDH); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PGD), + &env->CSR_PGD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCL), + &env->CSR_PWCL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PWCH), + &env->CSR_PWCH); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_STLBPS), + &env->CSR_STLBPS); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_RVACFG), + &env->CSR_RVACFG); + + /* CPUID is constant after poweron, it should be set only once */ + if (level >= KVM_PUT_FULL_STATE) { + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CPUID), + &env->CSR_CPUID); + } + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG1), + &env->CSR_PRCFG1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG2), + &env->CSR_PRCFG2); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_PRCFG3), + &env->CSR_PRCFG3); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(0)), + &env->CSR_SAVE[0]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(1)), + &env->CSR_SAVE[1]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(2)), + &env->CSR_SAVE[2]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(3)), + &env->CSR_SAVE[3]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(4)), + &env->CSR_SAVE[4]); + + ret |= kvm_set_one_reg(cs, 
KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(5)), + &env->CSR_SAVE[5]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(6)), + &env->CSR_SAVE[6]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_SAVE(7)), + &env->CSR_SAVE[7]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TID), + &env->CSR_TID); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_CNTC), + &env->CSR_CNTC); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TICLR), + &env->CSR_TICLR); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_LLBCTL), + &env->CSR_LLBCTL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL1), + &env->CSR_IMPCTL1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_IMPCTL2), + &env->CSR_IMPCTL2); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRENTRY), + &env->CSR_TLBRENTRY); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRBADV), + &env->CSR_TLBRBADV); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRERA), + &env->CSR_TLBRERA); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRSAVE), + &env->CSR_TLBRSAVE); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO0), + &env->CSR_TLBRELO0); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRELO1), + &env->CSR_TLBRELO1); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBREHI), + &env->CSR_TLBREHI); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TLBRPRMD), + &env->CSR_TLBRPRMD); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(0)), + &env->CSR_DMW[0]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(1)), + &env->CSR_DMW[1]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(2)), + &env->CSR_DMW[2]); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_DMW(3)), + &env->CSR_DMW[3]); + /* + * timer cfg must be put at last since it is used to enable + * guest timer + */ + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TVAL), + &env->CSR_TVAL); + + ret |= kvm_set_one_reg(cs, KVM_IOC_CSRID(LOONGARCH_CSR_TCFG), + &env->CSR_TCFG); + return ret; +} + +static int kvm_loongarch_get_regs_fp(CPUState *cs) +{ + int ret, i; + struct kvm_fpu fpu; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret = kvm_vcpu_ioctl(cs, KVM_GET_FPU, &fpu); + if (ret < 0) { + trace_kvm_failed_get_fpu(strerror(errno)); + return ret; + } + + env->fcsr0 = fpu.fcsr; + for (i = 0; i < 32; i++) { + env->fpr[i].vreg.UD[0] = fpu.fpr[i].val64[0]; + env->fpr[i].vreg.UD[1] = fpu.fpr[i].val64[1]; + env->fpr[i].vreg.UD[2] = fpu.fpr[i].val64[2]; + env->fpr[i].vreg.UD[3] = fpu.fpr[i].val64[3]; + } + for (i = 0; i < 8; i++) { + env->cf[i] = fpu.fcc & 0xFF; + fpu.fcc = fpu.fcc >> 8; + } + + return ret; +} + +static int kvm_loongarch_put_regs_fp(CPUState *cs) +{ + int ret, i; + struct kvm_fpu fpu; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + fpu.fcsr = env->fcsr0; + fpu.fcc = 0; + for (i = 0; i < 32; i++) { + fpu.fpr[i].val64[0] = env->fpr[i].vreg.UD[0]; + fpu.fpr[i].val64[1] = env->fpr[i].vreg.UD[1]; + fpu.fpr[i].val64[2] = env->fpr[i].vreg.UD[2]; + fpu.fpr[i].val64[3] = env->fpr[i].vreg.UD[3]; + } + + for (i = 0; i < 8; i++) { + fpu.fcc |= env->cf[i] << (8 * i); + } + + ret = kvm_vcpu_ioctl(cs, KVM_SET_FPU, &fpu); + if (ret < 0) { + trace_kvm_failed_put_fpu(strerror(errno)); + } + + return ret; +} + +static int kvm_loongarch_put_lbt(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + uint64_t val; 
+ int ret; + + /* Check whether the VM supports LBT first */ + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LBT_ALL) != 7) { + return 0; + } + + /* Set the six LBT registers: scr0-scr3, eflags and ftop */ + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR0, &env->lbt.scr0); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR1, &env->lbt.scr1); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR2, &env->lbt.scr2); + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR3, &env->lbt.scr3); + /* + * Note that KVM_REG_LOONGARCH_LBT_FTOP is defined as 64-bit while + * lbt.ftop is 32-bit; the same applies to the KVM_REG_LOONGARCH_LBT_EFLAGS + * register. + */ + val = env->lbt.eflags; + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_EFLAGS, &val); + val = env->lbt.ftop; + ret |= kvm_set_one_reg(cs, KVM_REG_LOONGARCH_LBT_FTOP, &val); + + return ret; +} + +static int kvm_loongarch_get_lbt(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + uint64_t val; + int ret; + + /* Check whether the VM supports LBT first */ + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LBT_ALL) != 7) { + return 0; + } + + /* Get the six LBT registers: scr0-scr3, eflags and ftop */ + ret = kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR0, &env->lbt.scr0); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR1, &env->lbt.scr1); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR2, &env->lbt.scr2); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_SCR3, &env->lbt.scr3); + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_EFLAGS, &val); + env->lbt.eflags = (uint32_t)val; + ret |= kvm_get_one_reg(cs, KVM_REG_LOONGARCH_LBT_FTOP, &val); + env->lbt.ftop = (uint32_t)val; + + return ret; +} + +void kvm_arch_reset_vcpu(CPUState *cs) +{ + CPULoongArchState *env = cpu_env(cs); + int ret = 0; + uint64_t unused = 0; + + env->mp_state = KVM_MP_STATE_RUNNABLE; + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_VCPU_RESET, &unused); + if (ret) { + error_report("Failed to set KVM_REG_LOONGARCH_VCPU_RESET: %s", + strerror(errno)); + exit(EXIT_FAILURE); + } +} + +static int kvm_loongarch_get_mpstate(CPUState *cs) +{ + int ret = 0; + struct kvm_mp_state mp_state; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + if (cap_has_mp_state) { + ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state); + if (ret) { + trace_kvm_failed_get_mpstate(strerror(errno)); + return ret; + } + env->mp_state = mp_state.mp_state; + } + + return ret; +} + +static int kvm_loongarch_put_mpstate(CPUState *cs) +{ + int ret = 0; + + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + struct kvm_mp_state mp_state = { + .mp_state = env->mp_state + }; + + if (cap_has_mp_state) { + ret = kvm_vcpu_ioctl(cs, KVM_SET_MP_STATE, &mp_state); + if (ret) { + trace_kvm_failed_put_mpstate(strerror(errno)); + } + } + + return ret; +} + +static int kvm_loongarch_get_cpucfg(CPUState *cs) +{ + int i, ret = 0; + uint64_t val; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + for (i = 0; i < 21; i++) { + ret = kvm_get_one_reg(cs, KVM_IOC_CPUCFG(i), &val); + if (ret < 0) { + trace_kvm_failed_get_cpucfg(strerror(errno)); + } + env->cpucfg[i] = (uint32_t)val; + } + return ret; +} + +static int kvm_check_cpucfg2(CPUState *cs) +{ + int ret; + uint64_t val; + struct kvm_device_attr attr = { + .group = KVM_LOONGARCH_VCPU_CPUCFG, + .attr = 2, + .addr = (uint64_t)&val, + }; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + ret = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, &attr); + + if
(!ret) { + kvm_vcpu_ioctl(cs, KVM_GET_DEVICE_ATTR, &attr); + env->cpucfg[2] &= val; + + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, FP)) { + /* The FP minimal version is 1. */ + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, FP_VER, 1); + } + + if (FIELD_EX32(env->cpucfg[2], CPUCFG2, LLFTP)) { + /* The LLFTP minimal version is 1. */ + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LLFTP_VER, 1); + } + } + + return ret; +} + +static int kvm_loongarch_put_cpucfg(CPUState *cs) +{ + int i, ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + uint64_t val; + + for (i = 0; i < 21; i++) { + if (i == 2) { + ret = kvm_check_cpucfg2(cs); + if (ret) { + return ret; + } + } + val = env->cpucfg[i]; + ret = kvm_set_one_reg(cs, KVM_IOC_CPUCFG(i), &val); + if (ret < 0) { + trace_kvm_failed_put_cpucfg(strerror(errno)); + } + } + return ret; +} + +int kvm_arch_get_registers(CPUState *cs) +{ + int ret; + + ret = kvm_loongarch_get_regs_core(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_cpucfg(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_csr(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_regs_fp(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_lbt(cs); + if (ret) { + return ret; + } + + ret = kvm_get_stealtime(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_get_mpstate(cs); + return ret; +} + +int kvm_arch_put_registers(CPUState *cs, int level) +{ + int ret; + + ret = kvm_loongarch_put_regs_core(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_cpucfg(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_csr(cs, level); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_regs_fp(cs); + if (ret) { + return ret; + } + + ret = kvm_loongarch_put_lbt(cs); + if (ret) { + return ret; + } + + if (level >= KVM_PUT_FULL_STATE) { + /* + * only KVM_PUT_FULL_STATE is required, kvm kernel will clear + * guest_addr for KVM_PUT_RESET_STATE + */ + ret = kvm_set_stealtime(cs); + if (ret) { + return ret; + } + } + + ret = kvm_loongarch_put_mpstate(cs); + return ret; +} + +static void kvm_loongarch_vm_stage_change(void *opaque, bool running, + RunState state) +{ + int ret; + CPUState *cs = opaque; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + + if (running) { + ret = kvm_set_one_reg(cs, KVM_REG_LOONGARCH_COUNTER, + &cpu->kvm_state_counter); + if (ret < 0) { + trace_kvm_failed_put_counter(strerror(errno)); + } + } else { + ret = kvm_get_one_reg(cs, KVM_REG_LOONGARCH_COUNTER, + &cpu->kvm_state_counter); + if (ret < 0) { + trace_kvm_failed_get_counter(strerror(errno)); + } + } +} + +static bool kvm_feature_supported(CPUState *cs, enum loongarch_features feature) +{ + int ret; + struct kvm_device_attr attr; + + switch (feature) { + case LOONGARCH_FEATURE_LBT: + /* + * Return all if all the LBT features are supported such as: + * KVM_LOONGARCH_VM_FEAT_X86BT + * KVM_LOONGARCH_VM_FEAT_ARMBT + * KVM_LOONGARCH_VM_FEAT_MIPSBT + */ + attr.group = KVM_LOONGARCH_VM_FEAT_CTRL; + attr.attr = KVM_LOONGARCH_VM_FEAT_X86BT; + ret = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + attr.attr = KVM_LOONGARCH_VM_FEAT_ARMBT; + ret |= kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + attr.attr = KVM_LOONGARCH_VM_FEAT_MIPSBT; + ret |= kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + return (ret == 0); + + case LOONGARCH_FEATURE_PMU: + attr.group = KVM_LOONGARCH_VM_FEAT_CTRL; + attr.attr = KVM_LOONGARCH_VM_FEAT_PMU; + ret = kvm_vm_ioctl(kvm_state, KVM_HAS_DEVICE_ATTR, &attr); + return 
(ret == 0); + + default: + return false; + } + + return false; +} + +static int kvm_cpu_check_lbt(CPUState *cs, Error **errp) +{ + CPULoongArchState *env = cpu_env(cs); + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + bool kvm_supported; + + kvm_supported = kvm_feature_supported(cs, LOONGARCH_FEATURE_LBT); + if (cpu->lbt == ON_OFF_AUTO_ON) { + if (kvm_supported) { + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LBT_ALL, 7); + } else { + error_setg(errp, "'lbt' feature not supported by KVM on this host"); + return -ENOTSUP; + } + } else if ((cpu->lbt == ON_OFF_AUTO_AUTO) && kvm_supported) { + env->cpucfg[2] = FIELD_DP32(env->cpucfg[2], CPUCFG2, LBT_ALL, 7); + } + + return 0; +} + +static int kvm_cpu_check_pmu(CPUState *cs, Error **errp) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = cpu_env(cs); + bool kvm_supported; + + kvm_supported = kvm_feature_supported(cs, LOONGARCH_FEATURE_PMU); + if (cpu->pmu == ON_OFF_AUTO_ON) { + if (!kvm_supported) { + error_setg(errp, "'pmu' feature not supported by KVM on the host"); + return -ENOTSUP; + } + } else if (cpu->pmu != ON_OFF_AUTO_AUTO) { + /* disable pmu if ON_OFF_AUTO_OFF is set */ + kvm_supported = false; + } + + if (kvm_supported) { + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMP, 1); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMNUM, 3); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, PMBITS, 63); + env->cpucfg[6] = FIELD_DP32(env->cpucfg[6], CPUCFG6, UPM, 1); + } + return 0; +} + +int kvm_arch_init_vcpu(CPUState *cs) +{ + uint64_t val; + int ret; + Error *local_err = NULL; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + + ret = 0; + cpu->vmsentry = qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs); + + if (!kvm_get_one_reg(cs, KVM_REG_LOONGARCH_DEBUG_INST, &val)) { + brk_insn = val; + } + + ret = kvm_cpu_check_lbt(cs, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + + ret = kvm_cpu_check_pmu(cs, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + + return ret; +} + +int kvm_arch_destroy_vcpu(CPUState *cs) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + qemu_del_vm_change_state_handler(cpu->vmsentry); + return 0; +} + +unsigned long kvm_arch_vcpu_id(CPUState *cs) +{ + return cs->cpu_index; +} + +int kvm_arch_release_virq_post(int virq) +{ + return 0; +} + +int kvm_arch_msi_data_to_gsi(uint32_t data) +{ + abort(); +} + +int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, + uint64_t address, uint32_t data, PCIDevice *dev) +{ + return 0; +} + +int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, + int vector, PCIDevice *dev) +{ + return 0; +} + +void kvm_arch_init_irq_routing(KVMState *s) +{ +} + +int kvm_arch_get_default_type(MachineState *ms) +{ + return 0; +} + +int kvm_arch_init(MachineState *ms, KVMState *s) +{ + cap_has_mp_state = kvm_check_extension(s, KVM_CAP_MP_STATE); + return 0; +} + +int kvm_arch_irqchip_create(KVMState *s) +{ + return 0; +} + +void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) +{ +} + +MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) +{ + return MEMTXATTRS_UNSPECIFIED; +} + +int kvm_arch_process_async_events(CPUState *cs) +{ + return cs->halted; +} + +bool kvm_arch_stop_on_emulation_error(CPUState *cs) +{ + return true; +} + +bool kvm_arch_cpu_check_are_resettable(void) +{ + return true; +} + + +void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg) +{ + if (kvm_sw_breakpoints_active(cpu)) { + dbg->control |= KVM_GUESTDBG_ENABLE | 
KVM_GUESTDBG_USE_SW_BP; + } +} + +int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) +{ + if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 0) || + cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk_insn, 4, 1)) { + error_report("%s failed", __func__); + return -EINVAL; + } + return 0; +} + +int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) +{ + static uint32_t brk; + + if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk, 4, 0) || + brk != brk_insn || + cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 1)) { + error_report("%s failed", __func__); + return -EINVAL; + } + return 0; +} + +int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type) +{ + return -ENOSYS; +} + +int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type) +{ + return -ENOSYS; +} + +void kvm_arch_remove_all_hw_breakpoints(void) +{ +} + +static bool kvm_loongarch_handle_debug(CPUState *cs, struct kvm_run *run) +{ + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + + kvm_cpu_synchronize_state(cs); + if (cs->singlestep_enabled) { + return true; + } + + if (kvm_find_sw_breakpoint(cs, env->pc)) { + return true; + } + + return false; +} + +int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) +{ + int ret = 0; + LoongArchCPU *cpu = LOONGARCH_CPU(cs); + CPULoongArchState *env = &cpu->env; + MemTxAttrs attrs = {}; + + attrs.requester_id = env_cpu(env)->cpu_index; + + trace_kvm_arch_handle_exit(run->exit_reason); + switch (run->exit_reason) { + case KVM_EXIT_LOONGARCH_IOCSR: + address_space_rw(env->address_space_iocsr, + run->iocsr_io.phys_addr, + attrs, + run->iocsr_io.data, + run->iocsr_io.len, + run->iocsr_io.is_write); + break; + + case KVM_EXIT_DEBUG: + if (kvm_loongarch_handle_debug(cs, run)) { + ret = EXCP_DEBUG; + } + break; + + default: + ret = -1; + warn_report("KVM: unknown exit reason %d", run->exit_reason); + break; + } + return ret; +} + +int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level) +{ + struct kvm_interrupt intr; + CPUState *cs = CPU(cpu); + + if (level) { + intr.irq = irq; + } else { + intr.irq = -irq; + } + + trace_kvm_set_intr(irq, level); + return kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &intr); +} + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} diff --git a/target/loongarch/kvm/kvm_loongarch.h b/target/loongarch/kvm/kvm_loongarch.h new file mode 100644 index 0000000000000000000000000000000000000000..1051a341ec2633647e55f3c37ad5998c4f02ba06 --- /dev/null +++ b/target/loongarch/kvm/kvm_loongarch.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * QEMU LoongArch kvm interface + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + */ + +#include "cpu.h" + +#ifndef QEMU_KVM_LOONGARCH_H +#define QEMU_KVM_LOONGARCH_H + +int kvm_loongarch_set_interrupt(LoongArchCPU *cpu, int irq, int level); +void kvm_arch_reset_vcpu(CPUState *cs); + +#endif diff --git a/target/loongarch/kvm/meson.build b/target/loongarch/kvm/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..2266de6ca97c3122bbf437a129fae97771010529 --- /dev/null +++ b/target/loongarch/kvm/meson.build @@ -0,0 +1 @@ +loongarch_ss.add(when: 'CONFIG_KVM', if_true: files('kvm.c')) diff --git a/target/loongarch/loongarch-qmp-cmds.c b/target/loongarch/loongarch-qmp-cmds.c index 645672ff593fb202216ac135ad14dfc148aa936c..dc78a3ffa22b90f73f695f24b2d22b8f5221f13e 100644 --- a/target/loongarch/loongarch-qmp-cmds.c +++ 
b/target/loongarch/loongarch-qmp-cmds.c @@ -42,7 +42,7 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp) } static const char *cpu_model_advertised_features[] = { - "lsx", "lasx", NULL + "lsx", "lasx", "lbt", "pmu", NULL }; CpuModelExpansionInfo *qmp_query_cpu_model_expansion(CpuModelExpansionType type, diff --git a/target/loongarch/machine.c b/target/loongarch/machine.c index 1c4e01d07695ac67e22fa77166d48dabd7959da2..57abdddc09d4f129a1a9dd89e73e8eed340a2129 100644 --- a/target/loongarch/machine.c +++ b/target/loongarch/machine.c @@ -8,7 +8,10 @@ #include "qemu/osdep.h" #include "cpu.h" #include "migration/cpu.h" +#include "sysemu/tcg.h" #include "vec.h" +#include "kvm/kvm_loongarch.h" +#include "sysemu/kvm.h" static const VMStateDescription vmstate_fpu_reg = { .name = "fpu_reg", @@ -109,9 +112,38 @@ static const VMStateDescription vmstate_lasx = { }, }; +static bool lbt_needed(void *opaque) +{ + LoongArchCPU *cpu = opaque; + + return !!FIELD_EX64(cpu->env.cpucfg[2], CPUCFG2, LBT_ALL); +} + +static const VMStateDescription vmstate_lbt = { + .name = "cpu/lbt", + .version_id = 0, + .minimum_version_id = 0, + .needed = lbt_needed, + .fields = (const VMStateField[]) { + VMSTATE_UINT64(env.lbt.scr0, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr1, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr2, LoongArchCPU), + VMSTATE_UINT64(env.lbt.scr3, LoongArchCPU), + VMSTATE_UINT32(env.lbt.eflags, LoongArchCPU), + VMSTATE_UINT32(env.lbt.ftop, LoongArchCPU), + VMSTATE_END_OF_LIST() + }, +}; + +#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) +static bool tlb_needed(void *opaque) +{ + return tcg_enabled(); +} + /* TLB state */ -const VMStateDescription vmstate_tlb = { - .name = "cpu/tlb", +static const VMStateDescription vmstate_tlb_entry = { + .name = "cpu/tlb_entry", .version_id = 0, .minimum_version_id = 0, .fields = (VMStateField[]) { @@ -122,15 +154,31 @@ const VMStateDescription vmstate_tlb = { } }; +static const VMStateDescription vmstate_tlb = { + .name = "cpu/tlb", + .version_id = 0, + .minimum_version_id = 0, + .needed = tlb_needed, + .fields = (const VMStateField[]) { + VMSTATE_STRUCT_ARRAY(env.tlb, LoongArchCPU, LOONGARCH_TLB_MAX, + 0, vmstate_tlb_entry, LoongArchTLB), + VMSTATE_END_OF_LIST() + } +}; +#endif + /* LoongArch CPU state */ const VMStateDescription vmstate_loongarch_cpu = { .name = "cpu", - .version_id = 1, - .minimum_version_id = 1, - .fields = (VMStateField[]) { + .version_id = 3, + .minimum_version_id = 3, + .fields = (const VMStateField[]) { VMSTATE_UINTTL_ARRAY(env.gpr, LoongArchCPU, 32), VMSTATE_UINTTL(env.pc, LoongArchCPU), + /* PV time */ + VMSTATE_UINT64(env.st.guest_addr, LoongArchCPU), + /* Remaining CSRs */ VMSTATE_UINT64(env.CSR_CRMD, LoongArchCPU), VMSTATE_UINT64(env.CSR_PRMD, LoongArchCPU), @@ -187,9 +235,10 @@ const VMStateDescription vmstate_loongarch_cpu = { VMSTATE_UINT64(env.CSR_DBG, LoongArchCPU), VMSTATE_UINT64(env.CSR_DERA, LoongArchCPU), VMSTATE_UINT64(env.CSR_DSAVE, LoongArchCPU), - /* TLB */ - VMSTATE_STRUCT_ARRAY(env.tlb, LoongArchCPU, LOONGARCH_TLB_MAX, - 0, vmstate_tlb, LoongArchTLB), + + VMSTATE_UINT64(kvm_state_counter, LoongArchCPU), + /* PV steal time */ + VMSTATE_UINT64(env.stealtime.guest_addr, LoongArchCPU), VMSTATE_END_OF_LIST() }, @@ -197,6 +246,10 @@ const VMStateDescription vmstate_loongarch_cpu = { &vmstate_fpu, &vmstate_lsx, &vmstate_lasx, +#if defined(CONFIG_TCG) && !defined(CONFIG_USER_ONLY) + &vmstate_tlb, +#endif + &vmstate_lbt, NULL } }; diff --git a/target/loongarch/meson.build b/target/loongarch/meson.build index 
18e8191e2b67ea369ea43796c471e3d7a89a9d01..7817318287d0f8c460218bbc63ea60614efae50c 100644 --- a/target/loongarch/meson.build +++ b/target/loongarch/meson.build @@ -3,31 +3,21 @@ gen = decodetree.process('insns.decode') loongarch_ss = ss.source_set() loongarch_ss.add(files( 'cpu.c', -)) -loongarch_tcg_ss = ss.source_set() -loongarch_tcg_ss.add(gen) -loongarch_tcg_ss.add(files( - 'fpu_helper.c', - 'op_helper.c', - 'translate.c', 'gdbstub.c', - 'vec_helper.c', )) -loongarch_tcg_ss.add(zlib) loongarch_system_ss = ss.source_set() loongarch_system_ss.add(files( + 'arch_dump.c', + 'cpu_helper.c', 'loongarch-qmp-cmds.c', 'machine.c', - 'tlb_helper.c', - 'constant_timer.c', - 'csr_helper.c', - 'iocsr_helper.c', )) common_ss.add(when: 'CONFIG_LOONGARCH_DIS', if_true: [files('disas.c'), gen]) -loongarch_ss.add_all(when: 'CONFIG_TCG', if_true: [loongarch_tcg_ss]) +subdir('tcg') target_arch += {'loongarch': loongarch_ss} target_system_arch += {'loongarch': loongarch_system_ss} +subdir('kvm') diff --git a/target/loongarch/constant_timer.c b/target/loongarch/tcg/constant_timer.c similarity index 100% rename from target/loongarch/constant_timer.c rename to target/loongarch/tcg/constant_timer.c diff --git a/target/loongarch/csr_helper.c b/target/loongarch/tcg/csr_helper.c similarity index 100% rename from target/loongarch/csr_helper.c rename to target/loongarch/tcg/csr_helper.c diff --git a/target/loongarch/fpu_helper.c b/target/loongarch/tcg/fpu_helper.c similarity index 100% rename from target/loongarch/fpu_helper.c rename to target/loongarch/tcg/fpu_helper.c diff --git a/target/loongarch/insn_trans/trans_arith.c.inc b/target/loongarch/tcg/insn_trans/trans_arith.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_arith.c.inc rename to target/loongarch/tcg/insn_trans/trans_arith.c.inc diff --git a/target/loongarch/insn_trans/trans_atomic.c.inc b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc similarity index 95% rename from target/loongarch/insn_trans/trans_atomic.c.inc rename to target/loongarch/tcg/insn_trans/trans_atomic.c.inc index 80c2e286fd889192ce11b4513a7d16df533d1dea..974bc2a70feddbf021a07b19a0859781eb3a11c4 100644 --- a/target/loongarch/insn_trans/trans_atomic.c.inc +++ b/target/loongarch/tcg/insn_trans/trans_atomic.c.inc @@ -5,14 +5,14 @@ static bool gen_ll(DisasContext *ctx, arg_rr_i *a, MemOp mop) { - TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE); + TCGv t1 = tcg_temp_new(); TCGv src1 = gpr_src(ctx, a->rj, EXT_NONE); TCGv t0 = make_address_i(ctx, src1, a->imm); - tcg_gen_qemu_ld_i64(dest, t0, ctx->mem_idx, mop); + tcg_gen_qemu_ld_i64(t1, t0, ctx->mem_idx, mop); tcg_gen_st_tl(t0, tcg_env, offsetof(CPULoongArchState, lladdr)); - tcg_gen_st_tl(dest, tcg_env, offsetof(CPULoongArchState, llval)); - gen_set_gpr(a->rd, dest, EXT_NONE); + tcg_gen_st_tl(t1, tcg_env, offsetof(CPULoongArchState, llval)); + gen_set_gpr(a->rd, t1, EXT_NONE); return true; } diff --git a/target/loongarch/insn_trans/trans_bit.c.inc b/target/loongarch/tcg/insn_trans/trans_bit.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_bit.c.inc rename to target/loongarch/tcg/insn_trans/trans_bit.c.inc diff --git a/target/loongarch/insn_trans/trans_branch.c.inc b/target/loongarch/tcg/insn_trans/trans_branch.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_branch.c.inc rename to target/loongarch/tcg/insn_trans/trans_branch.c.inc diff --git a/target/loongarch/insn_trans/trans_extra.c.inc b/target/loongarch/tcg/insn_trans/trans_extra.c.inc similarity index 100% rename 
from target/loongarch/insn_trans/trans_extra.c.inc rename to target/loongarch/tcg/insn_trans/trans_extra.c.inc diff --git a/target/loongarch/insn_trans/trans_farith.c.inc b/target/loongarch/tcg/insn_trans/trans_farith.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_farith.c.inc rename to target/loongarch/tcg/insn_trans/trans_farith.c.inc diff --git a/target/loongarch/insn_trans/trans_fcmp.c.inc b/target/loongarch/tcg/insn_trans/trans_fcmp.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fcmp.c.inc rename to target/loongarch/tcg/insn_trans/trans_fcmp.c.inc diff --git a/target/loongarch/insn_trans/trans_fcnv.c.inc b/target/loongarch/tcg/insn_trans/trans_fcnv.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fcnv.c.inc rename to target/loongarch/tcg/insn_trans/trans_fcnv.c.inc diff --git a/target/loongarch/insn_trans/trans_fmemory.c.inc b/target/loongarch/tcg/insn_trans/trans_fmemory.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fmemory.c.inc rename to target/loongarch/tcg/insn_trans/trans_fmemory.c.inc diff --git a/target/loongarch/insn_trans/trans_fmov.c.inc b/target/loongarch/tcg/insn_trans/trans_fmov.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_fmov.c.inc rename to target/loongarch/tcg/insn_trans/trans_fmov.c.inc diff --git a/target/loongarch/insn_trans/trans_memory.c.inc b/target/loongarch/tcg/insn_trans/trans_memory.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_memory.c.inc rename to target/loongarch/tcg/insn_trans/trans_memory.c.inc diff --git a/target/loongarch/insn_trans/trans_privileged.c.inc b/target/loongarch/tcg/insn_trans/trans_privileged.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_privileged.c.inc rename to target/loongarch/tcg/insn_trans/trans_privileged.c.inc diff --git a/target/loongarch/insn_trans/trans_shift.c.inc b/target/loongarch/tcg/insn_trans/trans_shift.c.inc similarity index 88% rename from target/loongarch/insn_trans/trans_shift.c.inc rename to target/loongarch/tcg/insn_trans/trans_shift.c.inc index 2f4bd6ff28819eb4c665a3775e6867f37294e6a7..377307785aab4837bc181f1691632e7970a9889d 100644 --- a/target/loongarch/insn_trans/trans_shift.c.inc +++ b/target/loongarch/tcg/insn_trans/trans_shift.c.inc @@ -67,19 +67,9 @@ static void gen_rotr_d(TCGv dest, TCGv src1, TCGv src2) tcg_gen_rotr_tl(dest, src1, t0); } -static bool trans_srai_w(DisasContext *ctx, arg_srai_w *a) +static void gen_sari_w(TCGv dest, TCGv src1, target_long imm) { - TCGv dest = gpr_dst(ctx, a->rd, EXT_NONE); - TCGv src1 = gpr_src(ctx, a->rj, EXT_ZERO); - - if (!avail_64(ctx)) { - return false; - } - - tcg_gen_sextract_tl(dest, src1, a->imm, 32 - a->imm); - gen_set_gpr(a->rd, dest, EXT_NONE); - - return true; + tcg_gen_sextract_tl(dest, src1, imm, 32 - imm); } TRANS(sll_w, ALL, gen_rrr, EXT_ZERO, EXT_NONE, EXT_SIGN, gen_sll_w) @@ -94,6 +84,7 @@ TRANS(slli_w, ALL, gen_rri_c, EXT_NONE, EXT_SIGN, tcg_gen_shli_tl) TRANS(slli_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_shli_tl) TRANS(srli_w, ALL, gen_rri_c, EXT_ZERO, EXT_SIGN, tcg_gen_shri_tl) TRANS(srli_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_shri_tl) +TRANS(srai_w, ALL, gen_rri_c, EXT_NONE, EXT_NONE, gen_sari_w) TRANS(srai_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_sari_tl) TRANS(rotri_w, 64, gen_rri_v, EXT_NONE, EXT_NONE, gen_rotr_w) TRANS(rotri_d, 64, gen_rri_c, EXT_NONE, EXT_NONE, tcg_gen_rotri_tl) diff --git a/target/loongarch/insn_trans/trans_vec.c.inc 
b/target/loongarch/tcg/insn_trans/trans_vec.c.inc similarity index 100% rename from target/loongarch/insn_trans/trans_vec.c.inc rename to target/loongarch/tcg/insn_trans/trans_vec.c.inc diff --git a/target/loongarch/iocsr_helper.c b/target/loongarch/tcg/iocsr_helper.c similarity index 76% rename from target/loongarch/iocsr_helper.c rename to target/loongarch/tcg/iocsr_helper.c index 6cd01d5f0940bede6de45f6ed35442e08770e937..b6916f53d20ca133f0000e773685cb94240bafe2 100644 --- a/target/loongarch/iocsr_helper.c +++ b/target/loongarch/tcg/iocsr_helper.c @@ -17,52 +17,52 @@ uint64_t helper_iocsrrd_b(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldub(&env->address_space_iocsr, r_addr, + return address_space_ldub(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_h(CPULoongArchState *env, target_ulong r_addr) { - return address_space_lduw(&env->address_space_iocsr, r_addr, + return address_space_lduw(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_w(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldl(&env->address_space_iocsr, r_addr, + return address_space_ldl(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } uint64_t helper_iocsrrd_d(CPULoongArchState *env, target_ulong r_addr) { - return address_space_ldq(&env->address_space_iocsr, r_addr, + return address_space_ldq(env->address_space_iocsr, r_addr, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_b(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stb(&env->address_space_iocsr, w_addr, + address_space_stb(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_h(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stw(&env->address_space_iocsr, w_addr, + address_space_stw(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_w(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stl(&env->address_space_iocsr, w_addr, + address_space_stl(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } void helper_iocsrwr_d(CPULoongArchState *env, target_ulong w_addr, target_ulong val) { - address_space_stq(&env->address_space_iocsr, w_addr, + address_space_stq(env->address_space_iocsr, w_addr, val, GET_MEMTXATTRS(env), NULL); } diff --git a/target/loongarch/tcg/meson.build b/target/loongarch/tcg/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..1a3cd589fb3c24d0513d004c8d3009d8489e7e57 --- /dev/null +++ b/target/loongarch/tcg/meson.build @@ -0,0 +1,19 @@ +if 'CONFIG_TCG' not in config_all + subdir_done() +endif + +loongarch_ss.add([zlib, gen]) + +loongarch_ss.add(files( + 'fpu_helper.c', + 'op_helper.c', + 'translate.c', + 'vec_helper.c', +)) + +loongarch_system_ss.add(files( + 'constant_timer.c', + 'csr_helper.c', + 'iocsr_helper.c', + 'tlb_helper.c', +)) diff --git a/target/loongarch/op_helper.c b/target/loongarch/tcg/op_helper.c similarity index 100% rename from target/loongarch/op_helper.c rename to target/loongarch/tcg/op_helper.c diff --git a/target/loongarch/tlb_helper.c b/target/loongarch/tcg/tlb_helper.c similarity index 67% rename from target/loongarch/tlb_helper.c rename to target/loongarch/tcg/tlb_helper.c index 449043c68be02cc2280f194419462360a3baa045..eedd1ac376265cb8ba08e74ca4de089ad970bb6a 100644 --- a/target/loongarch/tlb_helper.c +++ b/target/loongarch/tcg/tlb_helper.c @@ -17,234 +17,32 @@ #include 
"exec/log.h" #include "cpu-csr.h" -enum { - TLBRET_MATCH = 0, - TLBRET_BADADDR = 1, - TLBRET_NOMATCH = 2, - TLBRET_INVALID = 3, - TLBRET_DIRTY = 4, - TLBRET_RI = 5, - TLBRET_XI = 6, - TLBRET_PE = 7, -}; - -static int loongarch_map_tlb_entry(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - int access_type, int index, int mmu_idx) +static void get_dir_base_width(CPULoongArchState *env, uint64_t *dir_base, + uint64_t *dir_width, target_ulong level) { - LoongArchTLB *tlb = &env->tlb[index]; - uint64_t plv = mmu_idx; - uint64_t tlb_entry, tlb_ppn; - uint8_t tlb_ps, n, tlb_v, tlb_d, tlb_plv, tlb_nx, tlb_nr, tlb_rplv; - - if (index >= LOONGARCH_STLB) { - tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); - } else { - tlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); - } - n = (address >> tlb_ps) & 0x1;/* Odd or even */ - - tlb_entry = n ? tlb->tlb_entry1 : tlb->tlb_entry0; - tlb_v = FIELD_EX64(tlb_entry, TLBENTRY, V); - tlb_d = FIELD_EX64(tlb_entry, TLBENTRY, D); - tlb_plv = FIELD_EX64(tlb_entry, TLBENTRY, PLV); - if (is_la64(env)) { - tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_64, PPN); - tlb_nx = FIELD_EX64(tlb_entry, TLBENTRY_64, NX); - tlb_nr = FIELD_EX64(tlb_entry, TLBENTRY_64, NR); - tlb_rplv = FIELD_EX64(tlb_entry, TLBENTRY_64, RPLV); - } else { - tlb_ppn = FIELD_EX64(tlb_entry, TLBENTRY_32, PPN); - tlb_nx = 0; - tlb_nr = 0; - tlb_rplv = 0; - } - - /* Remove sw bit between bit12 -- bit PS*/ - tlb_ppn = tlb_ppn & ~(((0x1UL << (tlb_ps - 12)) -1)); - - /* Check access rights */ - if (!tlb_v) { - return TLBRET_INVALID; - } - - if (access_type == MMU_INST_FETCH && tlb_nx) { - return TLBRET_XI; - } - - if (access_type == MMU_DATA_LOAD && tlb_nr) { - return TLBRET_RI; - } - - if (((tlb_rplv == 0) && (plv > tlb_plv)) || - ((tlb_rplv == 1) && (plv != tlb_plv))) { - return TLBRET_PE; - } - - if ((access_type == MMU_DATA_STORE) && !tlb_d) { - return TLBRET_DIRTY; - } - - *physical = (tlb_ppn << R_TLBENTRY_64_PPN_SHIFT) | - (address & MAKE_64BIT_MASK(0, tlb_ps)); - *prot = PAGE_READ; - if (tlb_d) { - *prot |= PAGE_WRITE; - } - if (!tlb_nx) { - *prot |= PAGE_EXEC; - } - return TLBRET_MATCH; -} - -/* - * One tlb entry holds an adjacent odd/even pair, the vpn is the - * content of the virtual page number divided by 2. So the - * compare vpn is bit[47:15] for 16KiB page. while the vppn - * field in tlb entry contains bit[47:13], so need adjust. 
- * virt_vpn = vaddr[47:13] - */ -static bool loongarch_tlb_search(CPULoongArchState *env, target_ulong vaddr, - int *index) -{ - LoongArchTLB *tlb; - uint16_t csr_asid, tlb_asid, stlb_idx; - uint8_t tlb_e, tlb_ps, tlb_g, stlb_ps; - int i, compare_shift; - uint64_t vpn, tlb_vppn; - - csr_asid = FIELD_EX64(env->CSR_ASID, CSR_ASID, ASID); - stlb_ps = FIELD_EX64(env->CSR_STLBPS, CSR_STLBPS, PS); - vpn = (vaddr & TARGET_VIRT_MASK) >> (stlb_ps + 1); - stlb_idx = vpn & 0xff; /* VA[25:15] <==> TLBIDX.index for 16KiB Page */ - compare_shift = stlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; - - /* Search STLB */ - for (i = 0; i < 8; ++i) { - tlb = &env->tlb[i * 256 + stlb_idx]; - tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); - if (tlb_e) { - tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); - tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); - tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); - - if ((tlb_g == 1 || tlb_asid == csr_asid) && - (vpn == (tlb_vppn >> compare_shift))) { - *index = i * 256 + stlb_idx; - return true; - } - } - } - - /* Search MTLB */ - for (i = LOONGARCH_STLB; i < LOONGARCH_TLB_MAX; ++i) { - tlb = &env->tlb[i]; - tlb_e = FIELD_EX64(tlb->tlb_misc, TLB_MISC, E); - if (tlb_e) { - tlb_vppn = FIELD_EX64(tlb->tlb_misc, TLB_MISC, VPPN); - tlb_ps = FIELD_EX64(tlb->tlb_misc, TLB_MISC, PS); - tlb_asid = FIELD_EX64(tlb->tlb_misc, TLB_MISC, ASID); - tlb_g = FIELD_EX64(tlb->tlb_entry0, TLBENTRY, G); - compare_shift = tlb_ps + 1 - R_TLB_MISC_VPPN_SHIFT; - vpn = (vaddr & TARGET_VIRT_MASK) >> (tlb_ps + 1); - if ((tlb_g == 1 || tlb_asid == csr_asid) && - (vpn == (tlb_vppn >> compare_shift))) { - *index = i; - return true; - } - } - } - return false; -} - -static int loongarch_map_address(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - MMUAccessType access_type, int mmu_idx) -{ - int index, match; - - match = loongarch_tlb_search(env, address, &index); - if (match) { - return loongarch_map_tlb_entry(env, physical, prot, - address, access_type, index, mmu_idx); - } - - return TLBRET_NOMATCH; -} - -static hwaddr dmw_va2pa(CPULoongArchState *env, target_ulong va, - target_ulong dmw) -{ - if (is_la64(env)) { - return va & TARGET_VIRT_MASK; - } else { - uint32_t pseg = FIELD_EX32(dmw, CSR_DMW_32, PSEG); - return (va & MAKE_64BIT_MASK(0, R_CSR_DMW_32_VSEG_SHIFT)) | \ - (pseg << R_CSR_DMW_32_VSEG_SHIFT); - } -} - -static int get_physical_address(CPULoongArchState *env, hwaddr *physical, - int *prot, target_ulong address, - MMUAccessType access_type, int mmu_idx) -{ - int user_mode = mmu_idx == MMU_IDX_USER; - int kernel_mode = mmu_idx == MMU_IDX_KERNEL; - uint32_t plv, base_c, base_v; - int64_t addr_high; - uint8_t da = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, DA); - uint8_t pg = FIELD_EX64(env->CSR_CRMD, CSR_CRMD, PG); - - /* Check PG and DA */ - if (da & !pg) { - *physical = address & TARGET_PHYS_MASK; - *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - return TLBRET_MATCH; - } - - plv = kernel_mode | (user_mode << R_CSR_DMW_PLV3_SHIFT); - if (is_la64(env)) { - base_v = address >> R_CSR_DMW_64_VSEG_SHIFT; - } else { - base_v = address >> R_CSR_DMW_32_VSEG_SHIFT; - } - /* Check direct map window */ - for (int i = 0; i < 4; i++) { - if (is_la64(env)) { - base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_64, VSEG); - } else { - base_c = FIELD_EX64(env->CSR_DMW[i], CSR_DMW_32, VSEG); - } - if ((plv & env->CSR_DMW[i]) && (base_c == base_v)) { - *physical = dmw_va2pa(env, address, env->CSR_DMW[i]); - *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - return TLBRET_MATCH; - } - } - - /* 
Check valid extension */ - addr_high = sextract64(address, TARGET_VIRT_ADDR_SPACE_BITS, 16); - if (!(addr_high == 0 || addr_high == -1)) { - return TLBRET_BADADDR; - } - - /* Mapped address */ - return loongarch_map_address(env, physical, prot, address, - access_type, mmu_idx); -} - -hwaddr loongarch_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) -{ - LoongArchCPU *cpu = LOONGARCH_CPU(cs); - CPULoongArchState *env = &cpu->env; - hwaddr phys_addr; - int prot; - - if (get_physical_address(env, &phys_addr, &prot, addr, MMU_DATA_LOAD, - cpu_mmu_index(env, false)) != 0) { - return -1; + switch (level) { + case 1: + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_WIDTH); + break; + case 2: + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_WIDTH); + break; + case 3: + *dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_WIDTH); + break; + case 4: + *dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_BASE); + *dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_WIDTH); + break; + default: + /* level may be zero for ldpte */ + *dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTBASE); + *dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTWIDTH); + break; } - return phys_addr; } static void raise_mmu_exception(CPULoongArchState *env, target_ulong address, @@ -716,7 +514,25 @@ target_ulong helper_lddir(CPULoongArchState *env, target_ulong base, target_ulong badvaddr, index, phys, ret; int shift; uint64_t dir_base, dir_width; - bool huge = (base >> LOONGARCH_PAGE_HUGE_SHIFT) & 0x1; + + if (unlikely((level == 0) || (level > 4))) { + qemu_log_mask(LOG_GUEST_ERROR, + "Attempted LDDIR with level %"PRId64"\n", level); + return base; + } + + if (FIELD_EX64(base, TLBENTRY, HUGE)) { + if (unlikely(level == 4)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Attempted use of level 4 huge page\n"); + } + + if (FIELD_EX64(base, TLBENTRY, LEVEL)) { + return base; + } else { + return FIELD_DP64(base, TLBENTRY, LEVEL, level); + } + } badvaddr = env->CSR_TLBRBADV; base = base & TARGET_PHYS_MASK; @@ -725,30 +541,7 @@ target_ulong helper_lddir(CPULoongArchState *env, target_ulong base, shift = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTEWIDTH); shift = (shift + 1) * 3; - if (huge) { - return base; - } - switch (level) { - case 1: - dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_BASE); - dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR1_WIDTH); - break; - case 2: - dir_base = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_BASE); - dir_width = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, DIR2_WIDTH); - break; - case 3: - dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_BASE); - dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR3_WIDTH); - break; - case 4: - dir_base = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_BASE); - dir_width = FIELD_EX64(env->CSR_PWCH, CSR_PWCH, DIR4_WIDTH); - break; - default: - do_raise_exception(env, EXCCODE_INE, GETPC()); - return 0; - } + get_dir_base_width(env, &dir_base, &dir_width, level); index = (badvaddr >> dir_base) & ((1 << dir_width) - 1); phys = base | index << shift; ret = ldq_phys(cs->as, phys) & TARGET_PHYS_MASK; @@ -761,20 +554,42 @@ void helper_ldpte(CPULoongArchState *env, target_ulong base, target_ulong odd, CPUState *cs = env_cpu(env); target_ulong phys, tmp0, ptindex, ptoffset0, ptoffset1, ps, badv; int shift; - bool huge = (base >> LOONGARCH_PAGE_HUGE_SHIFT) & 0x1; uint64_t ptbase =
FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTBASE); uint64_t ptwidth = FIELD_EX64(env->CSR_PWCL, CSR_PWCL, PTWIDTH); + uint64_t dir_base, dir_width; + /* + * The parameter "base" comes in two forms: + * either a page table base address, + * in which bit 6 is 0, + * or a huge page entry, + * in which bit 6 is 1. + */ base = base & TARGET_PHYS_MASK; + if (FIELD_EX64(base, TLBENTRY, HUGE)) { + /* + * Get the huge page level and the huge page size, + * clear the huge page level information in the entry, + * clear the huge page bit, + * and move the HGLOBAL bit to the GLOBAL bit. + */ + get_dir_base_width(env, &dir_base, &dir_width, + FIELD_EX64(base, TLBENTRY, LEVEL)); + + base = FIELD_DP64(base, TLBENTRY, LEVEL, 0); + base = FIELD_DP64(base, TLBENTRY, HUGE, 0); + if (FIELD_EX64(base, TLBENTRY, HGLOBAL)) { + base = FIELD_DP64(base, TLBENTRY, HGLOBAL, 0); + base = FIELD_DP64(base, TLBENTRY, G, 1); + } - if (huge) { - /* Huge Page. base is paddr */ - tmp0 = base ^ (1 << LOONGARCH_PAGE_HUGE_SHIFT); - /* Move Global bit */ - tmp0 = ((tmp0 & (1 << LOONGARCH_HGLOBAL_SHIFT)) >> - LOONGARCH_HGLOBAL_SHIFT) << R_TLBENTRY_G_SHIFT | - (tmp0 & (~(1 << LOONGARCH_HGLOBAL_SHIFT))); - ps = ptbase + ptwidth - 1; + ps = dir_base + dir_width - 1; + /* + * A huge page is split evenly into an odd/even pair of pages + * when it is loaded into the TLB, + * so the TLB page size is half the huge page size. + */ + tmp0 = base; if (odd) { tmp0 += MAKE_64BIT_MASK(ps, 1); } diff --git a/target/loongarch/translate.c b/target/loongarch/tcg/translate.c similarity index 100% rename from target/loongarch/translate.c rename to target/loongarch/tcg/translate.c diff --git a/target/loongarch/vec_helper.c b/target/loongarch/tcg/vec_helper.c similarity index 100% rename from target/loongarch/vec_helper.c rename to target/loongarch/tcg/vec_helper.c diff --git a/target/loongarch/trace-events b/target/loongarch/trace-events new file mode 100644 index 0000000000000000000000000000000000000000..dea11edc0f1164318642cbed4306ca23a29ed3a8 --- /dev/null +++ b/target/loongarch/trace-events @@ -0,0 +1,15 @@ +# See docs/devel/tracing.rst for syntax documentation.
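+# Each line below declares one trace point: event_name(typed args) "printf-style format string".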
+ +#kvm.c +kvm_failed_get_regs_core(const char *msg) "Failed to get core regs from KVM: %s" +kvm_failed_put_regs_core(const char *msg) "Failed to put core regs into KVM: %s" +kvm_failed_get_fpu(const char *msg) "Failed to get fpu from KVM: %s" +kvm_failed_put_fpu(const char *msg) "Failed to put fpu into KVM: %s" +kvm_failed_get_mpstate(const char *msg) "Failed to get mp_state from KVM: %s" +kvm_failed_put_mpstate(const char *msg) "Failed to put mp_state into KVM: %s" +kvm_failed_get_counter(const char *msg) "Failed to get counter from KVM: %s" +kvm_failed_put_counter(const char *msg) "Failed to put counter into KVM: %s" +kvm_failed_get_cpucfg(const char *msg) "Failed to get cpucfg from KVM: %s" +kvm_failed_put_cpucfg(const char *msg) "Failed to put cpucfg into KVM: %s" +kvm_arch_handle_exit(int num) "kvm arch handle exit, the reason number: %d" +kvm_set_intr(int irq, int level) "kvm set interrupt, irq num: %d, level: %d" diff --git a/target/loongarch/trace.h b/target/loongarch/trace.h new file mode 100644 index 0000000000000000000000000000000000000000..c2ecb78f0843e24884d8279aec544ecc0f07fec4 --- /dev/null +++ b/target/loongarch/trace.h @@ -0,0 +1 @@ +#include "trace/trace-target_loongarch.h" diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c index 11c7e0a790203ed71662e85d6cdb312cc46af3e2..d95deaafcd31d05f164a0671f9d6e28c8e6e2c9d 100644 --- a/target/m68k/cpu.c +++ b/target/m68k/cpu.c @@ -396,12 +396,19 @@ static const VMStateDescription vmstate_freg = { } }; -static int fpu_post_load(void *opaque, int version) +static int fpu_pre_save(void *opaque) { M68kCPU *s = opaque; - cpu_m68k_restore_fp_status(&s->env); + s->env.fpsr = cpu_m68k_get_fpsr(&s->env); + return 0; +} + +static int fpu_post_load(void *opaque, int version) +{ + M68kCPU *s = opaque; + cpu_m68k_set_fpsr(&s->env, s->env.fpsr); return 0; } @@ -410,6 +417,7 @@ const VMStateDescription vmmstate_fpu = { .version_id = 1, .minimum_version_id = 1, .needed = fpu_needed, + .pre_save = fpu_pre_save, .post_load = fpu_post_load, .fields = (VMStateField[]) { VMSTATE_UINT32(env.fpcr, M68kCPU), diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h index 6cfc696d2ba81e06c367b70678eb8585931e8d20..4d78da9d5fc3135ba0d0f95743a8782ae314e18a 100644 --- a/target/m68k/cpu.h +++ b/target/m68k/cpu.h @@ -199,7 +199,8 @@ void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t); void cpu_m68k_set_sr(CPUM68KState *env, uint32_t); void cpu_m68k_restore_fp_status(CPUM68KState *env); void cpu_m68k_set_fpcr(CPUM68KState *env, uint32_t val); - +uint32_t cpu_m68k_get_fpsr(CPUM68KState *env); +void cpu_m68k_set_fpsr(CPUM68KState *env, uint32_t val); /* * Instead of computing the condition codes after each m68k instruction, diff --git a/target/m68k/fpu_helper.c b/target/m68k/fpu_helper.c index ab120b5f59b1f349af12a781fa10539d82769fc6..8314791f50428a5d32d6d1c81e8755be43f37ada 100644 --- a/target/m68k/fpu_helper.c +++ b/target/m68k/fpu_helper.c @@ -164,6 +164,78 @@ void HELPER(set_fpcr)(CPUM68KState *env, uint32_t val) cpu_m68k_set_fpcr(env, val); } +/* Convert host exception flags to cpu_m68k form. 
 */ +static int cpu_m68k_exceptbits_from_host(int host_bits) +{ + int target_bits = 0; + + if (host_bits & float_flag_invalid) { + target_bits |= 0x80; + } + if (host_bits & float_flag_overflow) { + target_bits |= 0x40; + } + if (host_bits & (float_flag_underflow | float_flag_output_denormal)) { + target_bits |= 0x20; + } + if (host_bits & float_flag_divbyzero) { + target_bits |= 0x10; + } + if (host_bits & float_flag_inexact) { + target_bits |= 0x08; + } + return target_bits; +} + +/* Convert cpu_m68k exception flags to host form. */ +static int cpu_m68k_exceptbits_to_host(int target_bits) +{ + int host_bits = 0; + + if (target_bits & 0x80) { + host_bits |= float_flag_invalid; + } + if (target_bits & 0x40) { + host_bits |= float_flag_overflow; + } + if (target_bits & 0x20) { + host_bits |= float_flag_underflow; + } + if (target_bits & 0x10) { + host_bits |= float_flag_divbyzero; + } + if (target_bits & 0x08) { + host_bits |= float_flag_inexact; + } + return host_bits; +} + +uint32_t cpu_m68k_get_fpsr(CPUM68KState *env) +{ + int host_flags = get_float_exception_flags(&env->fp_status); + int target_flags = cpu_m68k_exceptbits_from_host(host_flags); + int except = (env->fpsr & ~(0xf8)) | target_flags; + return except; +} + +uint32_t HELPER(get_fpsr)(CPUM68KState *env) +{ + return cpu_m68k_get_fpsr(env); +} + +void cpu_m68k_set_fpsr(CPUM68KState *env, uint32_t val) +{ + env->fpsr = val; + + int host_flags = cpu_m68k_exceptbits_to_host((int) env->fpsr); + set_float_exception_flags(host_flags, &env->fp_status); +} + +void HELPER(set_fpsr)(CPUM68KState *env, uint32_t val) +{ + cpu_m68k_set_fpsr(env, val); +} + #define PREC_BEGIN(prec) \ do { \ FloatX80RoundPrec old = \ diff --git a/target/m68k/helper.c b/target/m68k/helper.c index 0a1544cd68d9dc7cff8983283d07c83d773c32d6..beab4b96bc3be62372a17c7c44e4291586f8e504 100644 --- a/target/m68k/helper.c +++ b/target/m68k/helper.c @@ -118,7 +118,7 @@ static int m68k_fpu_gdb_get_reg(CPUM68KState *env, GByteArray *mem_buf, int n) case 8: /* fpcontrol */ return gdb_get_reg32(mem_buf, env->fpcr); case 9: /* fpstatus */ - return gdb_get_reg32(mem_buf, env->fpsr); + return gdb_get_reg32(mem_buf, cpu_m68k_get_fpsr(env)); case 10: /* fpiar, not implemented */ return gdb_get_reg32(mem_buf, 0); } @@ -137,7 +137,7 @@ static int m68k_fpu_gdb_set_reg(CPUM68KState *env, uint8_t *mem_buf, int n) cpu_m68k_set_fpcr(env, ldl_p(mem_buf)); return 4; case 9: /* fpstatus */ - env->fpsr = ldl_p(mem_buf); + cpu_m68k_set_fpsr(env, ldl_p(mem_buf)); return 4; case 10: /* fpiar, not implemented */ return 4; diff --git a/target/m68k/helper.h b/target/m68k/helper.h index 2bbe0dc03253d323078994f8a102f825b386a365..95aa5e53bb97b38084588d52cc90222b5c141851 100644 --- a/target/m68k/helper.h +++ b/target/m68k/helper.h @@ -54,6 +54,8 @@ DEF_HELPER_4(fsdiv, void, env, fp, fp, fp) DEF_HELPER_4(fddiv, void, env, fp, fp, fp) DEF_HELPER_4(fsgldiv, void, env, fp, fp, fp) DEF_HELPER_FLAGS_3(fcmp, TCG_CALL_NO_RWG, void, env, fp, fp) +DEF_HELPER_2(set_fpsr, void, env, i32) +DEF_HELPER_1(get_fpsr, i32, env) DEF_HELPER_FLAGS_2(set_fpcr, TCG_CALL_NO_RWG, void, env, i32) DEF_HELPER_FLAGS_2(ftst, TCG_CALL_NO_RWG, void, env, fp) DEF_HELPER_3(fconst, void, env, fp, i32) diff --git a/target/m68k/translate.c b/target/m68k/translate.c index 4a0b0b270364f2d91e7cdb91ecfc167d32ef4308..f8eeb70379853723f932978e0088a839357a7936 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -4686,7 +4686,7 @@ static void gen_load_fcr(DisasContext *s, TCGv res, int reg) tcg_gen_movi_i32(res, 0); break; case 
M68K_FPSR: - tcg_gen_ld_i32(res, tcg_env, offsetof(CPUM68KState, fpsr)); + gen_helper_get_fpsr(res, tcg_env); break; case M68K_FPCR: tcg_gen_ld_i32(res, tcg_env, offsetof(CPUM68KState, fpcr)); @@ -4700,7 +4700,7 @@ static void gen_store_fcr(DisasContext *s, TCGv val, int reg) case M68K_FPIAR: break; case M68K_FPSR: - tcg_gen_st_i32(val, tcg_env, offsetof(CPUM68KState, fpsr)); + gen_helper_set_fpsr(tcg_env, val); break; case M68K_FPCR: gen_helper_set_fpcr(tcg_env, val); diff --git a/target/ppc/machine.c b/target/ppc/machine.c index 68cbdffecd4f5042464473a5eb00f2326fab1c84..3e010f3a074b2af33e7e2aacdf41de22094d1ecf 100644 --- a/target/ppc/machine.c +++ b/target/ppc/machine.c @@ -621,7 +621,7 @@ static bool tlbemb_needed(void *opaque) } static const VMStateDescription vmstate_tlbemb = { - .name = "cpu/tlb6xx", + .name = "cpu/tlbemb", .version_id = 1, .minimum_version_id = 1, .needed = tlbemb_needed, diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc index 6db87ab3367f9fa6d427fe60812c8f1deb19ba64..986453d35f0a8028ca2b31668bee3070ed83cfc4 100644 --- a/target/ppc/translate/vsx-impl.c.inc +++ b/target/ppc/translate/vsx-impl.c.inc @@ -2268,7 +2268,7 @@ static bool do_lstxv(DisasContext *ctx, int ra, TCGv displ, static bool do_lstxv_D(DisasContext *ctx, arg_D *a, bool store, bool paired) { - if (paired || a->rt >= 32) { + if (paired || a->rt < 32) { REQUIRE_VSX(ctx); } else { REQUIRE_VECTOR(ctx); @@ -2292,7 +2292,7 @@ static bool do_lstxv_PLS_D(DisasContext *ctx, arg_PLS_D *a, static bool do_lstxv_X(DisasContext *ctx, arg_X *a, bool store, bool paired) { - if (paired || a->rt >= 32) { + if (paired || a->rt < 32) { REQUIRE_VSX(ctx); } else { REQUIRE_VECTOR(ctx); diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index 83c7c0cf07be15fe68ec512de670cccf9de56949..77cb59b8a17e4a71b18d600446b36e376d7ef091 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -1359,7 +1359,7 @@ const RISCVCPUMultiExtConfig riscv_cpu_extensions[] = { /* Vector cryptography extensions */ MULTI_EXT_CFG_BOOL("zvbb", ext_zvbb, false), MULTI_EXT_CFG_BOOL("zvbc", ext_zvbc, false), - MULTI_EXT_CFG_BOOL("zvkb", ext_zvkg, false), + MULTI_EXT_CFG_BOOL("zvkb", ext_zvkb, false), MULTI_EXT_CFG_BOOL("zvkg", ext_zvkg, false), MULTI_EXT_CFG_BOOL("zvkned", ext_zvkned, false), MULTI_EXT_CFG_BOOL("zvknha", ext_zvknha, false), diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c index e7e23b34f455d4b9d01acdfe4808325c86d2169a..4d8f1248dd08f3f6e5a44b121b45450acf1d8dd2 100644 --- a/target/riscv/cpu_helper.c +++ b/target/riscv/cpu_helper.c @@ -1644,10 +1644,10 @@ void riscv_cpu_do_interrupt(CPUState *cs) bool async = !!(cs->exception_index & RISCV_EXCP_INT_FLAG); target_ulong cause = cs->exception_index & RISCV_EXCP_INT_MASK; uint64_t deleg = async ? 
env->mideleg : env->medeleg; - bool s_injected = env->mvip & (1 << cause) & env->mvien && - !(env->mip & (1 << cause)); - bool vs_injected = env->hvip & (1 << cause) & env->hvien && - !(env->mip & (1 << cause)); + bool s_injected = env->mvip & (1ULL << cause) & env->mvien && + !(env->mip & (1ULL << cause)); + bool vs_injected = env->hvip & (1ULL << cause) & env->hvien && + !(env->mip & (1ULL << cause)); target_ulong tval = 0; target_ulong tinst = 0; target_ulong htval = 0; diff --git a/target/riscv/csr.c b/target/riscv/csr.c index fde7ce1a5336b5d744c6676edadfe64b1688a2df..d1bb7bc0d3df7e2da52bba2ee90ab45c3e171dbd 100644 --- a/target/riscv/csr.c +++ b/target/riscv/csr.c @@ -704,7 +704,7 @@ static RISCVException write_vxrm(CPURISCVState *env, int csrno, static RISCVException read_vxsat(CPURISCVState *env, int csrno, target_ulong *val) { - *val = env->vxsat; + *val = env->vxsat & BIT(0); return RISCV_EXCP_NONE; } @@ -714,7 +714,7 @@ static RISCVException write_vxsat(CPURISCVState *env, int csrno, #if !defined(CONFIG_USER_ONLY) env->mstatus |= MSTATUS_VS; #endif - env->vxsat = val; + env->vxsat = val & BIT(0); return RISCV_EXCP_NONE; } diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c index 45b6cf1cfa04a87dd9526c8b017ecca78a956e4f..b3dc2070f9e1c16808978ee8580e1aa7bb748477 100644 --- a/target/riscv/kvm/kvm-cpu.c +++ b/target/riscv/kvm/kvm-cpu.c @@ -369,10 +369,14 @@ static void kvm_riscv_update_cpu_cfg_isa_ext(RISCVCPU *cpu, CPUState *cs) reg = kvm_cpu_cfg_get(cpu, multi_ext_cfg); ret = kvm_set_one_reg(cs, id, ®); if (ret != 0) { - error_report("Unable to %s extension %s in KVM, error %d", - reg ? "enable" : "disable", - multi_ext_cfg->name, ret); - exit(EXIT_FAILURE); + if (!reg && ret == -EINVAL) { + warn_report("KVM cannot disable extension %s", + multi_ext_cfg->name); + } else { + error_report("Unable to enable extension %s in KVM, error %d", + multi_ext_cfg->name, ret); + exit(EXIT_FAILURE); + } } } } diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index c1c3a4d1eabc706e3a58eaa4227f076ba8e59593..0602b623e0fe6664c7cfc5db071db28091ef84e2 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -196,7 +196,7 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, uint32_t esz = 1 << log2_esz; uint32_t vma = vext_vma(desc); - for (i = env->vstart; i < env->vl; i++, env->vstart++) { + for (i = env->vstart; i < env->vl; env->vstart = ++i) { k = 0; while (k < nf) { if (!vm && !vext_elem_mask(v0, i)) { @@ -262,7 +262,7 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, uint32_t esz = 1 << log2_esz; /* load bytes from guest memory */ - for (i = env->vstart; i < evl; i++, env->vstart++) { + for (i = env->vstart; i < evl; env->vstart = ++i) { k = 0; while (k < nf) { target_ulong addr = base + ((i * nf + k) << log2_esz); @@ -376,7 +376,7 @@ vext_ldst_index(void *vd, void *v0, target_ulong base, uint32_t vma = vext_vma(desc); /* load bytes from guest memory */ - for (i = env->vstart; i < env->vl; i++, env->vstart++) { + for (i = env->vstart; i < env->vl; env->vstart = ++i) { k = 0; while (k < nf) { if (!vm && !vext_elem_mask(v0, i)) { @@ -4770,6 +4770,7 @@ void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ } \ *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset)); \ } \ + env->vstart = 0; \ /* set tail elements to 1s */ \ vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ } @@ -5045,7 +5046,7 @@ void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ } \ env->vstart = 0; \ 
/* set tail elements to 1s */ \ - vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ + vext_set_elems_1s(vd, vta, num * esz, total_elems * esz); \ } /* Compress into vd elements of vs2 where vs1 is enabled */ @@ -5063,9 +5064,17 @@ void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc) uint32_t startb = env->vstart * sewb; uint32_t i = startb; + if (HOST_BIG_ENDIAN && i % 8 != 0) { + uint32_t j = ROUND_UP(i, 8); + memcpy((uint8_t *)vd + H1(j - 1), + (uint8_t *)vs2 + H1(j - 1), + j - i); + i = j; + } + memcpy((uint8_t *)vd + H1(i), (uint8_t *)vs2 + H1(i), - maxsz - startb); + maxsz - i); env->vstart = 0; } diff --git a/target/riscv/vector_internals.c b/target/riscv/vector_internals.c index 9cf5c17cdeacad77952935a99d9e2021f813047e..be6eb040d223bc01372f31e55584a85698b170a9 100644 --- a/target/riscv/vector_internals.c +++ b/target/riscv/vector_internals.c @@ -29,6 +29,28 @@ void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt, if (tot - cnt == 0) { return ; } + + if (HOST_BIG_ENDIAN) { + /* + * Handle the case where the elements are inside + * only one uint64_t block, including setting the + * masked-off element. + */ + if (((tot - 1) ^ cnt) < 8) { + memset(base + H1(tot - 1), -1, tot - cnt); + return; + } + /* + * Otherwise, the elements cross at least two uint64_t blocks. + * Set the first, unaligned block. + */ + if (cnt % 8 != 0) { + uint32_t j = ROUND_UP(cnt, 8); + memset(base + H1(j - 1), -1, j - cnt); + cnt = j; + } + /* Set the remaining 64-bit aligned blocks. */ + } memset(base + cnt, -1, tot - cnt); } diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 6acfa1c91b20451222abbacfbfec7d0bfae3a35c..5e64f24cc2e576bdb15fc7bca8ce99604927b9cc 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -350,7 +350,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data) device_class_set_parent_reset(dc, s390_cpu_reset_full, &scc->parent_reset); scc->reset = s390_cpu_reset; - cc->class_by_name = s390_cpu_class_by_name, + cc->class_by_name = s390_cpu_class_by_name; cc->has_work = s390_cpu_has_work; cc->dump_state = s390_cpu_dump_state; cc->query_cpu_fast = s390_query_cpu_fast; diff --git a/target/sparc/helper.c b/target/sparc/helper.c index bd10b60e4bfb2bdb42dec24ea30045a62621f297..8820c59e7cbbace98972a63e2e8017dbdb2bad2e 100644 --- a/target/sparc/helper.c +++ b/target/sparc/helper.c @@ -121,7 +121,7 @@ uint64_t helper_sdiv(CPUSPARCState *env, target_ulong a, target_ulong b) return (uint32_t)(b32 < 0 ? INT32_MAX : INT32_MIN) | (-1ull << 32); } - a64 /= b; + a64 /= b32; r = a64; if (unlikely(r != a64)) { return (uint32_t)(a64 < 0 ? INT32_MIN : INT32_MAX) | (-1ull << 32); } diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc index bab0a173a37378535bfe8434a32201cdcbbc6b65..ad2690b90d22f36896ba5abd701ec7bfee83e3b4 100644 --- a/tcg/loongarch64/tcg-target.c.inc +++ b/tcg/loongarch64/tcg-target.c.inc @@ -365,8 +365,7 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, * back to the slow path. */ - intptr_t pc_offset; - tcg_target_long val_lo, val_hi, pc_hi, offset_hi; + intptr_t src_rx, pc_offset; tcg_target_long hi12, hi32, hi52; /* Value fits in signed i32. */ @@ -376,24 +375,23 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, } /* PC-relative cases. */ - pc_offset = tcg_pcrel_diff(s, (void *)val); - if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 0) { - /* Single pcaddu2i. 
*/ - tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2); - return; + src_rx = (intptr_t)tcg_splitwx_to_rx(s->code_ptr); + if ((val & 3) == 0) { + pc_offset = val - src_rx; + if (pc_offset == sextreg(pc_offset, 0, 22)) { + /* Single pcaddu2i. */ + tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2); + return; + } } - if (pc_offset == (int32_t)pc_offset) { - /* Offset within 32 bits; load with pcalau12i + ori. */ - val_lo = sextreg(val, 0, 12); - val_hi = val >> 12; - pc_hi = (val - pc_offset) >> 12; - offset_hi = val_hi - pc_hi; - - tcg_debug_assert(offset_hi == sextreg(offset_hi, 0, 20)); - tcg_out_opc_pcalau12i(s, rd, offset_hi); + pc_offset = (val >> 12) - (src_rx >> 12); + if (pc_offset == sextreg(pc_offset, 0, 20)) { + /* Load with pcalau12i + ori. */ + tcg_target_long val_lo = val & 0xfff; + tcg_out_opc_pcalau12i(s, rd, pc_offset); if (val_lo != 0) { - tcg_out_opc_ori(s, rd, rd, val_lo & 0xfff); + tcg_out_opc_ori(s, rd, rd, val_lo); } return; } diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index bb88943f79cfb635f574a1a7cd390c4301a6b3e2..733b44f105f2f2eb22b0a2153a618060c3190911 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -88,7 +88,20 @@ uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) uint32_t desc = 0; check_size_align(oprsz, maxsz, 0); - tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS)); + + /* + * We want to check that 'data' will fit into SIMD_DATA_BITS. + * However, some callers want to treat the data as a signed + * value (which they can later get back with simd_data()) + * and some want to treat it as an unsigned value. + * So here we assert only that the data will fit into the + * field in at least one way. This means that some invalid + * values from the caller will not be detected, e.g. if the + * caller wants to handle the value as a signed integer but + * incorrectly passes us 1 << (SIMD_DATA_BITS - 1). + */ + tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) || + data == extract32(data, 0, SIMD_DATA_BITS)); oprsz = (oprsz / 8) - 1; maxsz = (maxsz / 8) - 1; diff --git a/tcg/tcg.c b/tcg/tcg.c index 896a36caeba6ce0c480106a77e118c6a05417216..61fcf8597d1fbef96099514e5ec85b210acecbc0 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -764,6 +764,14 @@ static void alloc_tcg_plugin_context(TCGContext *s) #endif } +static void free_tcg_plugin_context(TCGContext *s) +{ +#ifdef CONFIG_PLUGIN + g_ptr_array_unref(s->plugin_tb->insns); + g_free(s->plugin_tb); +#endif +} + /* * All TCG threads except the parent (i.e. the one that called tcg_context_init * and registered the target's TCG globals) must register with this function @@ -814,6 +822,21 @@ void tcg_register_thread(void) tcg_ctx = s; } + +void tcg_unregister_thread(void) +{ + TCGContext *s = tcg_ctx; + unsigned int n; + + /* Unclaim an entry in tcg_ctxs */ + n = qatomic_fetch_dec(&tcg_cur_ctxs); + g_assert(n > 1); + qatomic_store_release(&tcg_ctxs[n - 1], 0); + + free_tcg_plugin_context(s); + + g_free(s); +} #endif /* !CONFIG_USER_ONLY */ /* pool based memory allocation */ diff --git a/tests/avocado/acpi-bits.py b/tests/avocado/acpi-bits.py index 68b9e98d4ea9d4092905e0ca9eb98aae082c0d3b..efe4f52ee05c17800f9be3958c60ac58691df0cc 100644 --- a/tests/avocado/acpi-bits.py +++ b/tests/avocado/acpi-bits.py @@ -54,6 +54,8 @@ deps = ["xorriso", "mformat"] # dependent tools needed in the test setup/box. supported_platforms = ['x86_64'] # supported test platforms. +# default timeout of 120 secs is sometimes not enough for bits test. 
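[Editor's note, referring back to the relaxed simd_desc() assertion in the tcg/tcg-op-gvec.c hunk above: a stand-alone sketch of why the same bit pattern can pass the unsigned check yet read back as negative through a signed extract. The 22-bit field width is assumed purely for illustration; the real SIMD_DATA_BITS is defined in tcg's gvec descriptor header.]

#include <stdint.h>
#include <stdio.h>

#define DATA_BITS 22   /* assumed field width, for illustration only */

/* sign-extract the low DATA_BITS bits, in the spirit of sextract32() */
static int32_t sext(uint32_t v)
{
    return (int32_t)(v << (32 - DATA_BITS)) >> (32 - DATA_BITS);
}

/* zero-extract the low DATA_BITS bits, in the spirit of extract32() */
static uint32_t zext(uint32_t v)
{
    return v & ((1u << DATA_BITS) - 1);
}

int main(void)
{
    uint32_t data = 1u << (DATA_BITS - 1);   /* 0x200000 */

    /* The value fits as unsigned, so the relaxed assertion passes... */
    printf("fits unsigned: %d\n", data == zext(data));
    /* ...but a caller treating it as signed later reads back -2097152. */
    printf("signed view: %d\n", (int)sext(data));
    return 0;
}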
+BITS_TIMEOUT = 200 def which(tool): """ looks up the full path for @tool, returns None if not found @@ -133,7 +135,7 @@ class AcpiBitsTest(QemuBaseTest): #pylint: disable=too-many-instance-attributes """ # in slower systems the test can take as long as 3 minutes to complete. - timeout = 200 + timeout = BITS_TIMEOUT def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -400,7 +402,8 @@ def test_acpi_smbios_bits(self): # biosbits has been configured to run all the specified test suites # in batch mode and then automatically initiate a vm shutdown. - # Rely on avocado's unit test timeout. - self._vm.event_wait('SHUTDOWN') + # Set timeout to BITS_TIMEOUT for the SHUTDOWN event from the bits VM, + # on par with the avocado test timeout. + self._vm.event_wait('SHUTDOWN', timeout=BITS_TIMEOUT) self._vm.wait(timeout=None) self.parse_log() diff --git a/tests/avocado/replay_linux.py b/tests/avocado/replay_linux.py index 270ccc1eae81340ff9b539f1fc6f713eac263f88..e95bff329999186728510a2663feebb29d888120 100644 --- a/tests/avocado/replay_linux.py +++ b/tests/avocado/replay_linux.py @@ -94,7 +94,7 @@ def launch_and_wait(self, record, args, shift): else: vm.event_wait('SHUTDOWN', self.timeout) vm.wait() - logger.info('successfully fihished the replay') + logger.info('successfully finished the replay') elapsed = time.time() - start_time logger.info('elapsed time %.2f sec' % elapsed) return elapsed diff --git a/tests/data/acpi/microvm/DSDT.pcie b/tests/data/acpi/microvm/DSDT.pcie index 765f14ef3d1e54d3cadccbf0a880f8adb73b3f1f..af4c3eb38866d5a32928a73b01ea49a946073aa6 100644 Binary files a/tests/data/acpi/microvm/DSDT.pcie and b/tests/data/acpi/microvm/DSDT.pcie differ diff --git a/tests/data/acpi/pc/DSDT b/tests/data/acpi/pc/DSDT index c93ad6b7f83a168a1833d7dba1112dd2ab8a431f..714279255b1c17241109cfaaf25a1e1ccb41259e 100644 Binary files a/tests/data/acpi/pc/DSDT and b/tests/data/acpi/pc/DSDT differ diff --git a/tests/data/acpi/pc/DSDT.acpierst b/tests/data/acpi/pc/DSDT.acpierst index f643fa2d034053fa07f74f095565b64f021d4290..c1655914728edabdf356ce41d076c7eeb84705ca 100644 Binary files a/tests/data/acpi/pc/DSDT.acpierst and b/tests/data/acpi/pc/DSDT.acpierst differ diff --git a/tests/data/acpi/pc/DSDT.acpihmat b/tests/data/acpi/pc/DSDT.acpihmat index 9d3695ff289036856886a093733926667a32a058..295e3d7bf86d080cefb9d1571cb84c7325a1d98d 100644 Binary files a/tests/data/acpi/pc/DSDT.acpihmat and b/tests/data/acpi/pc/DSDT.acpihmat differ diff --git a/tests/data/acpi/pc/DSDT.bridge b/tests/data/acpi/pc/DSDT.bridge index 840b45f354ac14c858d0af8fbd31e97949a65d4b..ff9694ee4cbd7213b1f9f9e684c6c9feb9564327 100644 Binary files a/tests/data/acpi/pc/DSDT.bridge and b/tests/data/acpi/pc/DSDT.bridge differ diff --git a/tests/data/acpi/pc/DSDT.cphp b/tests/data/acpi/pc/DSDT.cphp index dbc0141b2bbc77a6d806ff046dc137992c59a899..dfc25a2cb677843817768ed947df1a1ff5672c67 100644 Binary files a/tests/data/acpi/pc/DSDT.cphp and b/tests/data/acpi/pc/DSDT.cphp differ diff --git a/tests/data/acpi/pc/DSDT.dimmpxm b/tests/data/acpi/pc/DSDT.dimmpxm index 1294f655d418dbdccc095e0d47ab220869a61a07..6b74c6cdcb01aa2cb781715a8b595072da5c2fab 100644 Binary files a/tests/data/acpi/pc/DSDT.dimmpxm and b/tests/data/acpi/pc/DSDT.dimmpxm differ diff --git a/tests/data/acpi/pc/DSDT.hpbridge b/tests/data/acpi/pc/DSDT.hpbridge index 8012b5eb3155377dc7995b73059ecb267d19232c..39e93ce02b6cd6db664f5469c6b974675205ced8 100644 Binary files a/tests/data/acpi/pc/DSDT.hpbridge and b/tests/data/acpi/pc/DSDT.hpbridge differ diff --git 
a/tests/data/acpi/pc/DSDT.hpbrroot b/tests/data/acpi/pc/DSDT.hpbrroot index 4fa0c6fe720f7859f0541b82f828c0329a3c0548..06e1f09d7069c5917bdb56101551551273410adb 100644 Binary files a/tests/data/acpi/pc/DSDT.hpbrroot and b/tests/data/acpi/pc/DSDT.hpbrroot differ diff --git a/tests/data/acpi/pc/DSDT.ipmikcs b/tests/data/acpi/pc/DSDT.ipmikcs index 0a891baf458abee4a772ffba7a31914ec22418ec..950eab96f517ade58f5c479684d449191288e98c 100644 Binary files a/tests/data/acpi/pc/DSDT.ipmikcs and b/tests/data/acpi/pc/DSDT.ipmikcs differ diff --git a/tests/data/acpi/pc/DSDT.memhp b/tests/data/acpi/pc/DSDT.memhp index 9b442a64cf711b33d80691fe84f1d3a6256f943b..81533f080774c483a87d31b947e4ac6c7d4b81a8 100644 Binary files a/tests/data/acpi/pc/DSDT.memhp and b/tests/data/acpi/pc/DSDT.memhp differ diff --git a/tests/data/acpi/pc/DSDT.nohpet b/tests/data/acpi/pc/DSDT.nohpet index 1754c6878839fc657230e1e714cd7c5142e0a77e..b8b37e810e17176cfc3ce105bf3a8c97de627f8d 100644 Binary files a/tests/data/acpi/pc/DSDT.nohpet and b/tests/data/acpi/pc/DSDT.nohpet differ diff --git a/tests/data/acpi/pc/DSDT.numamem b/tests/data/acpi/pc/DSDT.numamem index 9fc731d3d2bcde5e2612a8ccd81e12098134afe9..6283080304b6b6d80346457a0f428867a9e4a09b 100644 Binary files a/tests/data/acpi/pc/DSDT.numamem and b/tests/data/acpi/pc/DSDT.numamem differ diff --git a/tests/data/acpi/pc/DSDT.roothp b/tests/data/acpi/pc/DSDT.roothp index e654c83ebe40c413b204c711adcefe3f04655e8c..19b5f0ceea9ad696e6fff21f0cb9955217b017ac 100644 Binary files a/tests/data/acpi/pc/DSDT.roothp and b/tests/data/acpi/pc/DSDT.roothp differ diff --git a/tests/data/acpi/q35/DSDT b/tests/data/acpi/q35/DSDT index fb89ae0ac6d4346e33156e9e4d3718698a0a1a8e..b0bbff7686c9a56129bfa3408e62f142cc482713 100644 Binary files a/tests/data/acpi/q35/DSDT and b/tests/data/acpi/q35/DSDT differ diff --git a/tests/data/acpi/q35/DSDT.acpierst b/tests/data/acpi/q35/DSDT.acpierst index 46fd25400b7c00ee9149ddb64cb5d5bd73f6a82b..f91cbe55fcfeea319babf7c9a0c6a6ccdc3320d1 100644 Binary files a/tests/data/acpi/q35/DSDT.acpierst and b/tests/data/acpi/q35/DSDT.acpierst differ diff --git a/tests/data/acpi/q35/DSDT.acpihmat b/tests/data/acpi/q35/DSDT.acpihmat index 61c5bd52a42242e85090934e8e45bf01642609d6..0949fb9d67c70dc882e50501ece421114ad8080b 100644 Binary files a/tests/data/acpi/q35/DSDT.acpihmat and b/tests/data/acpi/q35/DSDT.acpihmat differ diff --git a/tests/data/acpi/q35/DSDT.acpihmat-noinitiator b/tests/data/acpi/q35/DSDT.acpihmat-noinitiator index 3aaa2bbdf54a0d0cade14421e84c6ec5a42f96fa..0fa4daa35cf95f93ba8c15f478460fe4e14e6d9e 100644 Binary files a/tests/data/acpi/q35/DSDT.acpihmat-noinitiator and b/tests/data/acpi/q35/DSDT.acpihmat-noinitiator differ diff --git a/tests/data/acpi/q35/DSDT.applesmc b/tests/data/acpi/q35/DSDT.applesmc index 944209adeaa5bbb722431161c404cb51b8209993..a5d032b7d96113c9393036b2ba831adb6d584142 100644 Binary files a/tests/data/acpi/q35/DSDT.applesmc and b/tests/data/acpi/q35/DSDT.applesmc differ diff --git a/tests/data/acpi/q35/DSDT.bridge b/tests/data/acpi/q35/DSDT.bridge index d9938dba8fa5d405f7696c0dbdc24f3ae42ec934..3464f552974672bde25eb15f1c93c309c57ef5cb 100644 Binary files a/tests/data/acpi/q35/DSDT.bridge and b/tests/data/acpi/q35/DSDT.bridge differ diff --git a/tests/data/acpi/q35/DSDT.cphp b/tests/data/acpi/q35/DSDT.cphp index 20955d0aa30120553da35d5a6640055d26255cf9..7fd59bf6702c04a622f05ae356a2ea37312ab403 100644 Binary files a/tests/data/acpi/q35/DSDT.cphp and b/tests/data/acpi/q35/DSDT.cphp differ diff --git a/tests/data/acpi/q35/DSDT.cxl 
b/tests/data/acpi/q35/DSDT.cxl index 145301c52af9a17242bb306c210f8a7e0f01b827..27a1726af2d71417e0cd41b2a14bea0d9c410225 100644 Binary files a/tests/data/acpi/q35/DSDT.cxl and b/tests/data/acpi/q35/DSDT.cxl differ diff --git a/tests/data/acpi/q35/DSDT.dimmpxm b/tests/data/acpi/q35/DSDT.dimmpxm index 228374b55bd544116e359f659e546fc66cf8a895..1db0bf454a203006f866e6752d06422ae675cbd3 100644 Binary files a/tests/data/acpi/q35/DSDT.dimmpxm and b/tests/data/acpi/q35/DSDT.dimmpxm differ diff --git a/tests/data/acpi/q35/DSDT.ipmibt b/tests/data/acpi/q35/DSDT.ipmibt index 45f911ada5645f158f3d6c0c430ec1d52cadc5d8..25f43ae8efb55364a739e6b5e3cb4e71e61862b0 100644 Binary files a/tests/data/acpi/q35/DSDT.ipmibt and b/tests/data/acpi/q35/DSDT.ipmibt differ diff --git a/tests/data/acpi/q35/DSDT.ipmismbus b/tests/data/acpi/q35/DSDT.ipmismbus index e5d6811bee1233d74236453c49060390d74d4416..32bcd25bda9e9d2775790385f8da6a11e9d5cb46 100644 Binary files a/tests/data/acpi/q35/DSDT.ipmismbus and b/tests/data/acpi/q35/DSDT.ipmismbus differ diff --git a/tests/data/acpi/q35/DSDT.ivrs b/tests/data/acpi/q35/DSDT.ivrs index 46fd25400b7c00ee9149ddb64cb5d5bd73f6a82b..f91cbe55fcfeea319babf7c9a0c6a6ccdc3320d1 100644 Binary files a/tests/data/acpi/q35/DSDT.ivrs and b/tests/data/acpi/q35/DSDT.ivrs differ diff --git a/tests/data/acpi/q35/DSDT.memhp b/tests/data/acpi/q35/DSDT.memhp index 5ce081187a578ba7145a9ba20d30be36c13b7663..be90eb71d8dda8fe54c79ffffe103986ee06ae3a 100644 Binary files a/tests/data/acpi/q35/DSDT.memhp and b/tests/data/acpi/q35/DSDT.memhp differ diff --git a/tests/data/acpi/q35/DSDT.mmio64 b/tests/data/acpi/q35/DSDT.mmio64 index bdf36c4d575bfc4eb2eac3f00c9b7b4270f88677..01f276a6aff38a1d4f58640a9e6d120fc9a04b61 100644 Binary files a/tests/data/acpi/q35/DSDT.mmio64 and b/tests/data/acpi/q35/DSDT.mmio64 differ diff --git a/tests/data/acpi/q35/DSDT.multi-bridge b/tests/data/acpi/q35/DSDT.multi-bridge index 1db43a69e4c2affd8bd678bbef4d3c228380288e..1bd2ee8d2ebd3c9e0ed89a86478691f2e06f2590 100644 Binary files a/tests/data/acpi/q35/DSDT.multi-bridge and b/tests/data/acpi/q35/DSDT.multi-bridge differ diff --git a/tests/data/acpi/q35/DSDT.noacpihp b/tests/data/acpi/q35/DSDT.noacpihp index 8bc16887e1c963c61aaecf71712a09c0554f6d67..45cc2bcffa42d73db110afd5075556dcfe5d9936 100644 Binary files a/tests/data/acpi/q35/DSDT.noacpihp and b/tests/data/acpi/q35/DSDT.noacpihp differ diff --git a/tests/data/acpi/q35/DSDT.nohpet b/tests/data/acpi/q35/DSDT.nohpet index c13e45e3612646cc2e30f00b3b7e53335da816ea..f110504b9c813aa07802fc17d2869596a2eeca6f 100644 Binary files a/tests/data/acpi/q35/DSDT.nohpet and b/tests/data/acpi/q35/DSDT.nohpet differ diff --git a/tests/data/acpi/q35/DSDT.numamem b/tests/data/acpi/q35/DSDT.numamem index ba6669437e65952f24516ded954b33fe54bdedfb..6090958f39875f5806e72e23f32cb4b3ae840627 100644 Binary files a/tests/data/acpi/q35/DSDT.numamem and b/tests/data/acpi/q35/DSDT.numamem differ diff --git a/tests/data/acpi/q35/DSDT.pvpanic-isa b/tests/data/acpi/q35/DSDT.pvpanic-isa index 6ad42873e91c80cef5a42224cb4d31936dad59b4..7a8e568315a43f1fa98068d8e78995c98064fb91 100644 Binary files a/tests/data/acpi/q35/DSDT.pvpanic-isa and b/tests/data/acpi/q35/DSDT.pvpanic-isa differ diff --git a/tests/data/acpi/q35/DSDT.tis.tpm12 b/tests/data/acpi/q35/DSDT.tis.tpm12 index e381ce4cbf2b11f56a2d0537db4d21acc97450c9..29a416f0508655d2bfde01fff4d25ad7f89581d9 100644 Binary files a/tests/data/acpi/q35/DSDT.tis.tpm12 and b/tests/data/acpi/q35/DSDT.tis.tpm12 differ diff --git a/tests/data/acpi/q35/DSDT.tis.tpm2 
b/tests/data/acpi/q35/DSDT.tis.tpm2 index a09253042ce4a715922027245de8a2ab7449c5b7..59288f02c43cf2efc1555599131fde05dbbaa1cd 100644 Binary files a/tests/data/acpi/q35/DSDT.tis.tpm2 and b/tests/data/acpi/q35/DSDT.tis.tpm2 differ diff --git a/tests/data/acpi/q35/DSDT.viot b/tests/data/acpi/q35/DSDT.viot index 64e81f571120e3eb2b8c6c9545293a78c75b7bbd..d4d05ed620114ce793a59285e6237b566dd1390a 100644 Binary files a/tests/data/acpi/q35/DSDT.viot and b/tests/data/acpi/q35/DSDT.viot differ diff --git a/tests/data/acpi/virt/DSDT b/tests/data/acpi/virt/DSDT index c47503990715d389914fdf9c8bccb510761741ac..404bc5ac2188d885e28b6c80d499193865cf1c9c 100644 Binary files a/tests/data/acpi/virt/DSDT and b/tests/data/acpi/virt/DSDT differ diff --git a/tests/data/acpi/virt/DSDT.acpihmatvirt b/tests/data/acpi/virt/DSDT.acpihmatvirt index aee6ba017cd730948bfa93e91551eb10a6809293..5f9c0b2d3cdc55949c32d564c92309aa54529d8d 100644 Binary files a/tests/data/acpi/virt/DSDT.acpihmatvirt and b/tests/data/acpi/virt/DSDT.acpihmatvirt differ diff --git a/tests/data/acpi/virt/DSDT.memhp b/tests/data/acpi/virt/DSDT.memhp index bae36cdd397473afe3923c52f030641a5ab19d5d..d85565e3b7801de346e302e6e88bc76d88f33f27 100644 Binary files a/tests/data/acpi/virt/DSDT.memhp and b/tests/data/acpi/virt/DSDT.memhp differ diff --git a/tests/data/acpi/virt/DSDT.pxb b/tests/data/acpi/virt/DSDT.pxb index fbd78f44c4785d19759daea909fe6d6f9a6e6b01..ccb43ab242521cdfc80f6d6b170d2e0818186632 100644 Binary files a/tests/data/acpi/virt/DSDT.pxb and b/tests/data/acpi/virt/DSDT.pxb differ diff --git a/tests/data/acpi/virt/DSDT.topology b/tests/data/acpi/virt/DSDT.topology index 501314c91be01d927fd125e0c72e919fdd85592e..27fee07b86f6ab1b5672960b9e040fdc82185137 100644 Binary files a/tests/data/acpi/virt/DSDT.topology and b/tests/data/acpi/virt/DSDT.topology differ diff --git a/tests/data/acpi/virt/IORT b/tests/data/acpi/virt/IORT index 7efd0ce8a6b3928efa7e1373f688ab4c5f50543b..9f0958b3df5aa6b9c3092885f79a20d82da8f011 100644 Binary files a/tests/data/acpi/virt/IORT and b/tests/data/acpi/virt/IORT differ diff --git a/tests/data/acpi/virt/PPTT b/tests/data/acpi/virt/PPTT index 7a1258ecf123555b24462c98ccbb76b4ac1d0c2b..b89b2a9c71e0bc2713fc38f5de68fbc39b6302cb 100644 Binary files a/tests/data/acpi/virt/PPTT and b/tests/data/acpi/virt/PPTT differ diff --git a/tests/data/acpi/virt/PPTT.acpihmatvirt b/tests/data/acpi/virt/PPTT.acpihmatvirt index 4eef303a5b6168c6bc3795c2e2c53f65b4c4cfd4..945a73fff3b5c93367999199f06dbe7294f594b4 100644 Binary files a/tests/data/acpi/virt/PPTT.acpihmatvirt and b/tests/data/acpi/virt/PPTT.acpihmatvirt differ diff --git a/tests/data/acpi/virt/PPTT.topology b/tests/data/acpi/virt/PPTT.topology index 3fbcae5ff08aaf16fedf4da45e941661d79c1174..b0762c0a73eab106bab475f2a177e159b0b01c67 100644 Binary files a/tests/data/acpi/virt/PPTT.topology and b/tests/data/acpi/virt/PPTT.topology differ diff --git a/tests/docker/dockerfiles/debian-i686-cross.docker b/tests/docker/dockerfiles/debian-i686-cross.docker index 3fc4e15acdccf942975d70af5a2d87a269b0f5d6..e1c8e2b4948f14943ab68995729f5c3d3b70099f 100644 --- a/tests/docker/dockerfiles/debian-i686-cross.docker +++ b/tests/docker/dockerfiles/debian-i686-cross.docker @@ -1,10 +1,10 @@ # THIS FILE WAS AUTO-GENERATED # -# $ lcitool dockerfile --layers all --cross-arch i686 debian-11 qemu +# $ lcitool dockerfile --layers all --cross-arch i686 debian-12 qemu # # https://gitlab.com/libvirt/libvirt-ci -FROM docker.io/library/debian:11-slim +FROM docker.io/library/debian:12-slim RUN export 
DEBIAN_FRONTEND=noninteractive && \ apt-get update && \ @@ -47,16 +47,15 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ python3-opencv \ python3-pillow \ python3-pip \ - python3-setuptools \ python3-sphinx \ python3-sphinx-rtd-theme \ python3-venv \ - python3-wheel \ python3-yaml \ rpm2cpio \ sed \ socat \ sparse \ + swtpm \ tar \ tesseract-ocr \ tesseract-ocr-eng \ @@ -67,8 +66,6 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ sed -Ei 's,^# (en_US\.UTF-8 .*)$,\1,' /etc/locale.gen && \ dpkg-reconfigure locales -RUN /usr/bin/pip3 install tomli - ENV CCACHE_WRAPPERSDIR "/usr/libexec/ccache-wrappers" ENV LANG "en_US.UTF-8" ENV MAKE "/usr/bin/make" @@ -145,6 +142,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ libvdeplug-dev:i386 \ libvirglrenderer-dev:i386 \ libvte-2.91-dev:i386 \ + libxdp-dev:i386 \ libzstd-dev:i386 \ nettle-dev:i386 \ systemtap-sdt-dev:i386 \ diff --git a/tests/docker/dockerfiles/debian-mipsel-cross.docker b/tests/docker/dockerfiles/debian-mipsel-cross.docker index 5fcd641f1552a8538d9510949b02ad77d6990472..79ce4ae503cd0c01bf262ec6450c3fa054409a51 100644 --- a/tests/docker/dockerfiles/debian-mipsel-cross.docker +++ b/tests/docker/dockerfiles/debian-mipsel-cross.docker @@ -1,10 +1,10 @@ # THIS FILE WAS AUTO-GENERATED # -# $ lcitool dockerfile --layers all --cross-arch mipsel debian-11 qemu +# $ lcitool dockerfile --layers all --cross-arch mipsel debian-12 qemu # # https://gitlab.com/libvirt/libvirt-ci -FROM docker.io/library/debian:11-slim +FROM docker.io/library/debian:12-slim RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update && \ @@ -47,16 +47,15 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ python3-opencv \ python3-pillow \ python3-pip \ - python3-setuptools \ python3-sphinx \ python3-sphinx-rtd-theme \ python3-venv \ - python3-wheel \ python3-yaml \ rpm2cpio \ sed \ socat \ sparse \ + swtpm \ tar \ tesseract-ocr \ tesseract-ocr-eng \ @@ -67,8 +66,6 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ sed -Ei 's,^# (en_US\.UTF-8 .*)$,\1,' /etc/locale.gen && \ dpkg-reconfigure locales -RUN /usr/bin/pip3 install tomli - ENV CCACHE_WRAPPERSDIR "/usr/libexec/ccache-wrappers" ENV LANG "en_US.UTF-8" ENV MAKE "/usr/bin/make" @@ -143,6 +140,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ libvdeplug-dev:mipsel \ libvirglrenderer-dev:mipsel \ libvte-2.91-dev:mipsel \ + libxdp-dev:mipsel \ libzstd-dev:mipsel \ nettle-dev:mipsel \ systemtap-sdt-dev:mipsel \ diff --git a/tests/lcitool/refresh b/tests/lcitool/refresh index 0c93557ad6777fd845beafe1ddcfa32304425108..42ed7eba1dbc5881640f8cf56848de6336b05cff 100755 --- a/tests/lcitool/refresh +++ b/tests/lcitool/refresh @@ -159,7 +159,7 @@ try: trailer=cross_build("arm-linux-gnueabihf-", "arm-softmmu,arm-linux-user")) - generate_dockerfile("debian-i686-cross", "debian-11", + generate_dockerfile("debian-i686-cross", "debian-12", cross="i686", trailer=cross_build("x86_64-linux-gnu-", "x86_64-softmmu," @@ -171,7 +171,7 @@ try: trailer=cross_build("mips64el-linux-gnuabi64-", "mips64el-softmmu,mips64el-linux-user")) - generate_dockerfile("debian-mipsel-cross", "debian-11", + generate_dockerfile("debian-mipsel-cross", "debian-12", cross="mipsel", trailer=cross_build("mipsel-linux-gnu-", "mipsel-softmmu,mipsel-linux-user")) diff --git a/tests/qemu-iotests/049.out b/tests/qemu-iotests/049.out index 34e1b452e6e1725d18eece31b7339a11edc01adf..b4a9705ec2785c8d9f1f0126b8988ba83cccaaef 100644 --- a/tests/qemu-iotests/049.out +++ b/tests/qemu-iotests/049.out @@ -4,90 +4,90 @@ QA output created by 049 == 1. 
Traditional size parameter == qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024b -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1k -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1K -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1G -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1T -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024.0 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1024.0b -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5k -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 
lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5K -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5G -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 TEST_DIR/t.qcow2 1.5T -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 cache=writeback == 2. Specifying size via -o == qemu-img create -f qcow2 -o size=1024 TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024b TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1k TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1K TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1M TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1G TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 +Formatting 
'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1073741824 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1T TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1099511627776 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024.0 TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1024.0b TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1024 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5k TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5K TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1536 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5M TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1572864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5G TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1610612736 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o size=1.5T TEST_DIR/t.qcow2 -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=1649267441664 lazy_refcounts=off refcount_bits=16 cache=writeback == 3. 
Invalid sizes == @@ -132,84 +132,84 @@ qemu-img: TEST_DIR/t.qcow2: The image size must be specified only once == Check correct interpretation of suffixes for cluster size == qemu-img create -f qcow2 -o cluster_size=1024 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024b TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1k TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1K TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1M TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1048576 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1048576 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024.0 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=1024.0b TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=1024 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5k TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5K TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=512 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off 
refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o cluster_size=0.5M TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=524288 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=524288 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback == Check compat level option == qemu-img create -f qcow2 -o compat=0.10 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=1.1 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.42 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.42 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.42 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 'version' does not accept value '0.42' qemu-img create -f qcow2 -o compat=foobar TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=foobar lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=foobar lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 'version' does not accept value 'foobar' == Check preallocation option == qemu-img create -f qcow2 -o preallocation=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o preallocation=metadata TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=metadata compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=metadata compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o preallocation=1234 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=1234 compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off preallocation=1234 compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Parameter 
'preallocation' does not accept value '1234' == Check encryption option == qemu-img create -f qcow2 -o encryption=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=off cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=off cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 --object secret,id=sec0,data=123456 -o encryption=on,encrypt.key-secret=sec0 TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=on encrypt.key-secret=sec0 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 encryption=on encrypt.key-secret=sec0 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 cache=writeback == Check lazy_refcounts option (only with v3) == qemu-img create -f qcow2 -o compat=1.1,lazy_refcounts=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=1.1,lazy_refcounts=on TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=on refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=1.1 lazy_refcounts=on refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.10,lazy_refcounts=off TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=off refcount_bits=16 cache=writeback qemu-img create -f qcow2 -o compat=0.10,lazy_refcounts=on TEST_DIR/t.qcow2 64M -Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=on refcount_bits=16 +Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 compat=0.10 lazy_refcounts=on refcount_bits=16 cache=writeback qemu-img: TEST_DIR/t.qcow2: Lazy refcounts only supported with compatibility level 1.1 and above (use version=v3 or greater) == Expect error when backing file name is empty string == diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061 index 53c7d428e38548fac9659b5f710dade4419fae60..b71ac097d14bba1e6e3978459019e0c83e56f0e8 100755 --- a/tests/qemu-iotests/061 +++ b/tests/qemu-iotests/061 @@ -326,12 +326,14 @@ $QEMU_IMG amend -o "data_file=foo" "$TEST_IMG" echo _make_test_img -o "compat=1.1,data_file=$TEST_IMG.data" 64M $QEMU_IMG amend -o "data_file=foo" "$TEST_IMG" -_img_info --format-specific +$QEMU_IO -c "read 0 4k" "$TEST_IMG" 2>&1 | _filter_testdir | _filter_imgfmt +$QEMU_IO -c "open -o data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -c "read 0 4k" | _filter_qemu_io 
TEST_IMG="data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" _img_info --format-specific --image-opts echo $QEMU_IMG amend -o "data_file=" --image-opts "data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -_img_info --format-specific +$QEMU_IO -c "read 0 4k" "$TEST_IMG" 2>&1 | _filter_testdir | _filter_imgfmt +$QEMU_IO -c "open -o data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" -c "read 0 4k" | _filter_qemu_io TEST_IMG="data-file.filename=$TEST_IMG.data,file.filename=$TEST_IMG" _img_info --format-specific --image-opts echo diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out index 139fc68177de57e33567094b84d6403cf27b3376..24c33add7ce6030ea8a12a38b128831b293245e9 100644 --- a/tests/qemu-iotests/061.out +++ b/tests/qemu-iotests/061.out @@ -545,7 +545,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 qemu-img: data-file can only be set for images that use an external data file Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 data_file=TEST_DIR/t.IMGFMT.data -qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'foo': No such file or directory +qemu-io: can't open device TEST_DIR/t.IMGFMT: Could not open 'foo': No such file or directory +read 4096/4096 bytes at offset 0 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) image: TEST_DIR/t.IMGFMT file format: IMGFMT virtual size: 64 MiB (67108864 bytes) @@ -560,7 +562,9 @@ Format specific information: corrupt: false extended l2: false -qemu-img: Could not open 'TEST_DIR/t.IMGFMT': 'data-file' is required for this image +qemu-io: can't open device TEST_DIR/t.IMGFMT: 'data-file' is required for this image +read 4096/4096 bytes at offset 0 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) image: TEST_DIR/t.IMGFMT file format: IMGFMT virtual size: 64 MiB (67108864 bytes) diff --git a/tests/qemu-iotests/099.out b/tests/qemu-iotests/099.out index 8cce6275295fb66acffdbd27c9a4ecfb910a2657..f6f8f259574f8d2e705ebbd551817c6c5875da32 100644 --- a/tests/qemu-iotests/099.out +++ b/tests/qemu-iotests/099.out @@ -1,6 +1,6 @@ QA output created by 099 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072 -Formatting 'TEST_DIR/t.IMGFMT.compare', fmt=raw size=131072 +Formatting 'TEST_DIR/t.IMGFMT.compare', fmt=raw size=131072 cache=writeback === Testing simple filename for blkverify === diff --git a/tests/qemu-iotests/108 b/tests/qemu-iotests/108 index 54e935acf28a4d5006eab2e1c58c0a61e32257f6..a6fe261265845f8241fe1772dd433a16527ae720 100755 --- a/tests/qemu-iotests/108 +++ b/tests/qemu-iotests/108 @@ -55,7 +55,7 @@ _supported_os Linux _unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file # This test either needs sudo -n losetup or FUSE exports to work -if sudo -n losetup &>/dev/null; then +if test -c "/dev/loop-control" && sudo -n losetup &>/dev/null; then loopdev=true else loopdev=false diff --git a/tests/qemu-iotests/244 b/tests/qemu-iotests/244 index 3e61fa25bb6047b59fc2e8d3ed5596fc7317b02d..bb9cc6512f3545fc03a9f2089d34bd1270360c6b 100755 --- a/tests/qemu-iotests/244 +++ b/tests/qemu-iotests/244 @@ -215,9 +215,22 @@ $QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" $QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" # blkdebug doesn't support copy offloading, so this tests the error path -$QEMU_IMG amend -f $IMGFMT -o "data_file=blkdebug::$TEST_IMG.data" "$TEST_IMG" -$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" -$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" 
+test_img_with_blkdebug="json:{ + 'driver': 'qcow2', + 'file': { + 'driver': 'file', + 'filename': '$TEST_IMG' + }, + 'data-file': { + 'driver': 'blkdebug', + 'image': { + 'driver': 'file', + 'filename': '$TEST_IMG.data' + } + } +}" +$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$test_img_with_blkdebug" +$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$test_img_with_blkdebug" echo echo "=== Flushing should flush the data file ===" diff --git a/tests/qemu-iotests/270 b/tests/qemu-iotests/270 index 74352342db58c000b992d04a31cc5b23b246cefb..c37b674aa2e15e24840b68c15af98dcc2a601702 100755 --- a/tests/qemu-iotests/270 +++ b/tests/qemu-iotests/270 @@ -60,8 +60,16 @@ _make_test_img -o cluster_size=2M,data_file="$TEST_IMG.orig" \ # "write" 2G of data without using any space. # (qemu-img create does not like it, though, because null-co does not # support image creation.) -$QEMU_IMG amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" \ - "$TEST_IMG" +test_img_with_null_data="json:{ + 'driver': '$IMGFMT', + 'file': { + 'filename': '$TEST_IMG' + }, + 'data-file': { + 'driver': 'null-co', + 'size':'4294967296' + } +}" # This gives us a range of: # 2^31 - 512 + 768 - 1 = 2^31 + 255 > 2^31 @@ -74,7 +82,7 @@ $QEMU_IMG amend -o data_file="json:{'driver':'null-co',,'size':'4294967296'}" \ # on L2 boundaries, we need large L2 tables; hence the cluster size of # 2 MB. (Anything from 256 kB should work, though, because then one L2 # table covers 8 GB.) -$QEMU_IO -c "write 768 $((2 ** 31 - 512))" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "write 768 $((2 ** 31 - 512))" "$test_img_with_null_data" | _filter_qemu_io _check_test_img diff --git a/tests/qemu-iotests/tests/write-zeroes-unmap b/tests/qemu-iotests/tests/write-zeroes-unmap new file mode 100644 index 0000000000000000000000000000000000000000..7cfeeaf8391c918c90e60712cccf5b59d10add7a --- /dev/null +++ b/tests/qemu-iotests/tests/write-zeroes-unmap @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# group: quick +# +# Test write zeros unmap. +# +# Copyright (C) Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +seq="$(basename $0)" +echo "QA output created by $seq" + +trap _cleanup_test_img exit + +# get standard environment, filters and checks +cd .. +. ./common.rc +. 
./common.filter + +_supported_fmt raw +_supported_proto file +_supported_os Linux + +create_test_image() { + _make_test_img -f $IMGFMT 1m +} + +filter_command() { + _filter_testdir | _filter_qemu_io | _filter_qemu | _filter_hmp +} + +print_disk_usage() { + du -sh $TEST_IMG | _filter_testdir +} + +echo +echo "=== defaults - write zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -z 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + +echo +echo "=== defaults - write zeros unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + + +echo +echo "=== defaults - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ + | filter_command +print_disk_usage + +echo +echo "=== discard=off - write zeroes unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=off \ + | filter_command +print_disk_usage + +echo +echo "=== detect-zeroes=on - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,detect-zeroes=on \ + | filter_command +print_disk_usage + +echo +echo "=== detect-zeroes=on,discard=on - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,detect-zeroes=on,discard=on \ + | filter_command +print_disk_usage + +echo +echo "=== discard=on - write zeroes ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -z 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=on \ + | filter_command +print_disk_usage + +echo +echo "=== discard=on - write zeroes unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ + | $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT,discard=on \ + | filter_command +print_disk_usage diff --git a/tests/qemu-iotests/tests/write-zeroes-unmap.out b/tests/qemu-iotests/tests/write-zeroes-unmap.out new file mode 100644 index 0000000000000000000000000000000000000000..c9319948976fd8d256e8979e8557fc783dcdde29 --- /dev/null +++ b/tests/qemu-iotests/tests/write-zeroes-unmap.out @@ -0,0 +1,81 @@ +QA output created by write-zeroes-unmap + +=== defaults - write zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -z 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== defaults - write zeros unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== defaults - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X 
ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== discard=off - write zeroes unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== detect-zeroes=on - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== detect-zeroes=on,discard=on - write actual zeros === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -P 0 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== discard=on - write zeroes === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -z 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +1.0M TEST_DIR/t.raw + +=== discard=on - write zeroes unmap === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +QEMU X.Y.Z monitor - type 'help' for more information +(qemu) qemu-io none0 "write -zu 0 1m" +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +(qemu) quit +0 TEST_DIR/t.raw diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index fe6a9a8563ced0b7bd75b6d52c41f0ef9b813013..21811a1ab5c01b46102f357c138b87212271988c 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -1015,7 +1015,7 @@ static void test_acpi_q35_tcg(void) free_test_data(&data); } -static void test_acpi_q35_tcg_type4_count(void) +static void test_acpi_q35_kvm_type4_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1031,7 +1031,7 @@ static void test_acpi_q35_tcg_type4_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_core_count(void) +static void test_acpi_q35_kvm_core_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1048,7 +1048,7 @@ static void test_acpi_q35_tcg_core_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_core_count2(void) +static void test_acpi_q35_kvm_core_count2(void) { test_data data = { .machine = MACHINE_Q35, @@ -1065,7 +1065,7 @@ static void test_acpi_q35_tcg_core_count2(void) free_test_data(&data); } -static void test_acpi_q35_tcg_thread_count(void) +static void test_acpi_q35_kvm_thread_count(void) { test_data data = { .machine = MACHINE_Q35, @@ -1082,7 +1082,7 @@ static void test_acpi_q35_tcg_thread_count(void) free_test_data(&data); } -static void test_acpi_q35_tcg_thread_count2(void) +static void test_acpi_q35_kvm_thread_count2(void) { test_data data = { .machine = MACHINE_Q35, @@ -2262,15 +2262,15 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/q35/kvm/xapic", test_acpi_q35_kvm_xapic); qtest_add_func("acpi/q35/kvm/dmar", test_acpi_q35_kvm_dmar); qtest_add_func("acpi/q35/type4-count", - test_acpi_q35_tcg_type4_count); + test_acpi_q35_kvm_type4_count); qtest_add_func("acpi/q35/core-count", - test_acpi_q35_tcg_core_count); + test_acpi_q35_kvm_core_count); 
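Editor's note: the write-zeroes-unmap expectations just above pin down the semantics the new test checks; the raw file only drops to 0 bytes of host usage when the guest issues a zero write with the unmap hint and the drive was configured with discard=on. At the block layer that hint is BDRV_REQ_MAY_UNMAP, which the layer masks out when discards are disabled. A sketch against QEMU's internal API (editor's example; assumes an already-open BlockBackend):

#include "qemu/osdep.h"
#include "qemu/units.h"
#include "sysemu/block-backend.h"

static int zero_first_mib(BlockBackend *blk, bool allow_unmap)
{
    BdrvRequestFlags flags = allow_unmap ? BDRV_REQ_MAY_UNMAP : 0;

    /*
     * With flags == 0 the driver must keep zeroed blocks allocated;
     * with BDRV_REQ_MAY_UNMAP (and discard enabled on the node) it
     * may punch a hole instead, which is what the test observes via du.
     */
    return blk_pwrite_zeroes(blk, 0, 1 * MiB, flags);
}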
qtest_add_func("acpi/q35/core-count2", - test_acpi_q35_tcg_core_count2); + test_acpi_q35_kvm_core_count2); qtest_add_func("acpi/q35/thread-count", - test_acpi_q35_tcg_thread_count); + test_acpi_q35_kvm_thread_count); qtest_add_func("acpi/q35/thread-count2", - test_acpi_q35_tcg_thread_count2); + test_acpi_q35_kvm_thread_count2); } if (qtest_has_device("virtio-iommu-pci")) { qtest_add_func("acpi/q35/viot", test_acpi_q35_viot); diff --git a/tests/qtest/libqos/loongarch-virt-machine.c b/tests/qtest/libqos/loongarch-virt-machine.c new file mode 100644 index 0000000000000000000000000000000000000000..c12089c015df6d7f18144e3fe99b588354e45329 --- /dev/null +++ b/tests/qtest/libqos/loongarch-virt-machine.c @@ -0,0 +1,114 @@ +/* + * libqos driver framework + * + * Copyright (c) 2018 Emanuele Giuseppe Esposito + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#include "qemu/osdep.h" +#include "../libqtest.h" +#include "qemu/module.h" +#include "libqos-malloc.h" +#include "qgraph.h" +#include "virtio-mmio.h" +#include "generic-pcihost.h" +#include "hw/pci/pci_regs.h" + +#define LOONGARCH_PAGE_SIZE 0x1000 +#define LOONGARCH_VIRT_RAM_ADDR 0x100000 +#define LOONGARCH_VIRT_RAM_SIZE 0xFF00000 + +#define LOONGARCH_VIRT_PIO_BASE 0x18000000 +#define LOONGARCH_VIRT_PCIE_PIO_OFFSET 0x4000 +#define LOONGARCH_VIRT_PCIE_PIO_LIMIT 0x10000 +#define LOONGARCH_VIRT_PCIE_ECAM_BASE 0x20000000 +#define LOONGARCH_VIRT_PCIE_MMIO32_BASE 0x40000000 +#define LOONGARCH_VIRT_PCIE_MMIO32_LIMIT 0x80000000 + +typedef struct QVirtMachine QVirtMachine; + +struct QVirtMachine { + QOSGraphObject obj; + QGuestAllocator alloc; + QVirtioMMIODevice virtio_mmio; + QGenericPCIHost bridge; +}; + +static void virt_destructor(QOSGraphObject *obj) +{ + QVirtMachine *machine = (QVirtMachine *) obj; + alloc_destroy(&machine->alloc); +} + +static void *virt_get_driver(void *object, const char *interface) +{ + QVirtMachine *machine = object; + if (!g_strcmp0(interface, "memory")) { + return &machine->alloc; + } + + fprintf(stderr, "%s not present in loongarch/virtio\n", interface); + g_assert_not_reached(); +} + +static QOSGraphObject *virt_get_device(void *obj, const char *device) +{ + QVirtMachine *machine = obj; + if (!g_strcmp0(device, "generic-pcihost")) { + return &machine->bridge.obj; + } else if (!g_strcmp0(device, "virtio-mmio")) { + return &machine->virtio_mmio.obj; + } + + fprintf(stderr, "%s not present in loongarch/virt\n", device); + g_assert_not_reached(); +} + +static void loongarch_config_qpci_bus(QGenericPCIBus *qpci) +{ + qpci->gpex_pio_base = LOONGARCH_VIRT_PIO_BASE; + qpci->bus.pio_alloc_ptr = LOONGARCH_VIRT_PCIE_PIO_OFFSET; + qpci->bus.pio_limit = LOONGARCH_VIRT_PCIE_PIO_LIMIT; + qpci->bus.mmio_alloc_ptr = LOONGARCH_VIRT_PCIE_MMIO32_BASE; + qpci->bus.mmio_limit = LOONGARCH_VIRT_PCIE_MMIO32_LIMIT; + qpci->ecam_alloc_ptr = LOONGARCH_VIRT_PCIE_ECAM_BASE; +} + +static void *qos_create_machine_loongarch_virt(QTestState *qts) +{ + QVirtMachine *machine = g_new0(QVirtMachine, 1); + + 
alloc_init(&machine->alloc, 0, + LOONGARCH_VIRT_RAM_ADDR, + LOONGARCH_VIRT_RAM_ADDR + LOONGARCH_VIRT_RAM_SIZE, + LOONGARCH_PAGE_SIZE); + + qos_create_generic_pcihost(&machine->bridge, qts, &machine->alloc); + loongarch_config_qpci_bus(&machine->bridge.pci); + + machine->obj.get_device = virt_get_device; + machine->obj.get_driver = virt_get_driver; + machine->obj.destructor = virt_destructor; + return machine; +} + +static void virt_machine_register_nodes(void) +{ + qos_node_create_machine_args("loongarch64/virt", + qos_create_machine_loongarch_virt, + " -cpu la464"); + qos_node_contains("loongarch64/virt", "generic-pcihost", NULL); +} + +libqos_init(virt_machine_register_nodes); diff --git a/tests/qtest/libqos/meson.build b/tests/qtest/libqos/meson.build index 90aae42a22535f0dd8fe2fe7f9a615dd1b9a805b..482c9b2aab99dc0ef0dc4d1c7a2ce6468ecb640a 100644 --- a/tests/qtest/libqos/meson.build +++ b/tests/qtest/libqos/meson.build @@ -60,6 +60,7 @@ libqos_srcs = files( 'arm-xilinx-zynq-a9-machine.c', 'ppc64_pseries-machine.c', 'x86_64_pc-machine.c', + 'loongarch-virt-machine.c', ) if have_virtfs diff --git a/tests/qtest/libqos/qgraph.h b/tests/qtest/libqos/qgraph.h index 287022a67c1db3dcb347e98bfc577e5ac2901883..1b5de02e7bebf3feaf368124d8fb81ae2c1003e6 100644 --- a/tests/qtest/libqos/qgraph.h +++ b/tests/qtest/libqos/qgraph.h @@ -24,7 +24,7 @@ #include "libqos-malloc.h" /* maximum path length */ -#define QOS_PATH_MAX_ELEMENT_SIZE 64 +#define QOS_PATH_MAX_ELEMENT_SIZE 128 typedef struct QOSGraphObject QOSGraphObject; typedef struct QOSGraphNode QOSGraphNode; diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c index 24fb7b3525cf4ea023649dfc5de45f7c395b0876..164e09c299aaa9342803226e8ce8c9aa5b0f0315 100644 --- a/tests/qtest/migration-helpers.c +++ b/tests/qtest/migration-helpers.c @@ -118,6 +118,12 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) 
rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}", args); + + if (!qdict_haskey(rsp, "return")) { + g_autoptr(GString) s = qobject_to_json_pretty(QOBJECT(rsp), true); + g_test_message("%s", s->str); + } + g_assert(qdict_haskey(rsp, "return")); qobject_unref(rsp); @@ -292,3 +298,35 @@ char *resolve_machine_version(const char *alias, const char *var1, return find_common_machine_version(machine_name, var1, var2); } + +typedef struct { + char *name; + void (*func)(void); +} MigrationTest; + +static void migration_test_destroy(gpointer data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_free(test->name); + g_free(test); +} + +static void migration_test_wrapper(const void *data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_test_message("Running /%s%s", qtest_get_arch(), test->name); + test->func(); +} + +void migration_test_add(const char *path, void (*fn)(void)) +{ + MigrationTest *test = g_new0(MigrationTest, 1); + + test->func = fn; + test->name = g_strdup(path); + + qtest_add_data_func_full(path, test, migration_test_wrapper, + migration_test_destroy); +} diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h index e31dc85cc7524ba1830f30f066b273e51db3b44f..0d9a02edc7e451f436a7424ff2f67331aa264ad0 100644 --- a/tests/qtest/migration-helpers.h +++ b/tests/qtest/migration-helpers.h @@ -47,4 +47,5 @@ char *find_common_machine_version(const char *mtype, const char *var1, const char *var2); char *resolve_machine_version(const char *alias, const char *var1, const char *var2); +void migration_test_add(const char *path, void (*fn)(void)); #endif /* MIGRATION_HELPERS_H */ diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 0fbaa6a90fd6c1dfe2fc3c4e7f05f6b1b3fc10d8..3385ca1f15386e1d0b50b21a24ee601180a2241b 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2560,6 +2560,13 @@ static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) { + /* + * Overloading this test to also check that set_parameter does not error. + * This is also done in the tests for the other compression methods. 
+ */ + migrate_set_parameter_int(from, "multifd-zlib-level", 2); + migrate_set_parameter_int(to, "multifd-zlib-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib"); } @@ -2568,10 +2575,42 @@ static void * test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, QTestState *to) { + migrate_set_parameter_int(from, "multifd-zstd-level", 2); + migrate_set_parameter_int(to, "multifd-zstd-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd"); } #endif /* CONFIG_ZSTD */ +#ifdef CONFIG_QATZIP +static void * +test_migrate_precopy_tcp_multifd_qatzip_start(QTestState *from, + QTestState *to) +{ + migrate_set_parameter_int(from, "multifd-qatzip-level", 2); + migrate_set_parameter_int(to, "multifd-qatzip-level", 2); + + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qatzip"); +} +#endif + +#ifdef CONFIG_QPL +static void * +test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qpl"); +} +#endif /* CONFIG_QPL */ +#ifdef CONFIG_UADK +static void * +test_migrate_precopy_tcp_multifd_uadk_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "uadk"); +} +#endif /* CONFIG_UADK */ + static void test_multifd_tcp_none(void) { MigrateCommon args = { @@ -2607,6 +2646,39 @@ static void test_multifd_tcp_zstd(void) } #endif +#ifdef CONFIG_QATZIP +static void test_multifd_tcp_qatzip(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qatzip_start, + }; + test_precopy_common(&args); +} +#endif + +#ifdef CONFIG_QPL +static void test_multifd_tcp_qpl(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qpl_start, + }; + test_precopy_common(&args); +} +#endif + +#ifdef CONFIG_UADK +static void test_multifd_tcp_uadk(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_uadk_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_GNUTLS static void * test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from, @@ -3339,62 +3411,64 @@ int main(int argc, char **argv) module_call_init(MODULE_INIT_QOM); if (has_uffd) { - qtest_add_func("/migration/postcopy/plain", test_postcopy); - qtest_add_func("/migration/postcopy/recovery/plain", - test_postcopy_recovery); - qtest_add_func("/migration/postcopy/preempt/plain", test_postcopy_preempt); - qtest_add_func("/migration/postcopy/preempt/recovery/plain", - test_postcopy_preempt_recovery); + migration_test_add("/migration/postcopy/plain", test_postcopy); + migration_test_add("/migration/postcopy/recovery/plain", + test_postcopy_recovery); + migration_test_add("/migration/postcopy/preempt/plain", + test_postcopy_preempt); + migration_test_add("/migration/postcopy/preempt/recovery/plain", + test_postcopy_preempt_recovery); if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/postcopy/compress/plain", - test_postcopy_compress); - qtest_add_func("/migration/postcopy/recovery/compress/plain", - test_postcopy_recovery_compress); + migration_test_add("/migration/postcopy/compress/plain", + test_postcopy_compress); + migration_test_add("/migration/postcopy/recovery/compress/plain", + test_postcopy_recovery_compress); } #ifndef _WIN32 - qtest_add_func("/migration/postcopy/recovery/double-failures", - test_postcopy_recovery_double_fail); + 
migration_test_add("/migration/postcopy/recovery/double-failures", + test_postcopy_recovery_double_fail); #endif /* _WIN32 */ - } - qtest_add_func("/migration/bad_dest", test_baddest); + migration_test_add("/migration/bad_dest", test_baddest); #ifndef _WIN32 if (!g_str_equal(arch, "s390x")) { - qtest_add_func("/migration/analyze-script", test_analyze_script); + migration_test_add("/migration/analyze-script", test_analyze_script); } #endif - qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain); - qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle); + migration_test_add("/migration/precopy/unix/plain", + test_precopy_unix_plain); + migration_test_add("/migration/precopy/unix/xbzrle", + test_precopy_unix_xbzrle); /* * Compression fails from time to time. * Put test here but don't enable it until everything is fixed. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/precopy/unix/compress/wait", - test_precopy_unix_compress); - qtest_add_func("/migration/precopy/unix/compress/nowait", - test_precopy_unix_compress_nowait); + migration_test_add("/migration/precopy/unix/compress/wait", + test_precopy_unix_compress); + migration_test_add("/migration/precopy/unix/compress/nowait", + test_precopy_unix_compress_nowait); } - qtest_add_func("/migration/precopy/file", - test_precopy_file); - qtest_add_func("/migration/precopy/file/offset", - test_precopy_file_offset); - qtest_add_func("/migration/precopy/file/offset/bad", - test_precopy_file_offset_bad); + migration_test_add("/migration/precopy/file", + test_precopy_file); + migration_test_add("/migration/precopy/file/offset", + test_precopy_file_offset); + migration_test_add("/migration/precopy/file/offset/bad", + test_precopy_file_offset_bad); /* * Our CI system has problems with shared memory. * Don't run this test until we find a workaround. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/mode/reboot", test_mode_reboot); + migration_test_add("/migration/mode/reboot", test_mode_reboot); } #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/unix/tls/psk", - test_precopy_unix_tls_psk); + migration_test_add("/migration/precopy/unix/tls/psk", + test_precopy_unix_tls_psk); if (has_uffd) { /* @@ -3402,110 +3476,120 @@ int main(int argc, char **argv) * channels are tested under precopy. Here what we want to test is the * general postcopy path that has TLS channel enabled. 
*/ - qtest_add_func("/migration/postcopy/tls/psk", test_postcopy_tls_psk); - qtest_add_func("/migration/postcopy/recovery/tls/psk", - test_postcopy_recovery_tls_psk); - qtest_add_func("/migration/postcopy/preempt/tls/psk", - test_postcopy_preempt_tls_psk); - qtest_add_func("/migration/postcopy/preempt/recovery/tls/psk", - test_postcopy_preempt_all); + migration_test_add("/migration/postcopy/tls/psk", + test_postcopy_tls_psk); + migration_test_add("/migration/postcopy/recovery/tls/psk", + test_postcopy_recovery_tls_psk); + migration_test_add("/migration/postcopy/preempt/tls/psk", + test_postcopy_preempt_tls_psk); + migration_test_add("/migration/postcopy/preempt/recovery/tls/psk", + test_postcopy_preempt_all); } #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/unix/tls/x509/default-host", - test_precopy_unix_tls_x509_default_host); - qtest_add_func("/migration/precopy/unix/tls/x509/override-host", - test_precopy_unix_tls_x509_override_host); + migration_test_add("/migration/precopy/unix/tls/x509/default-host", + test_precopy_unix_tls_x509_default_host); + migration_test_add("/migration/precopy/unix/tls/x509/override-host", + test_precopy_unix_tls_x509_override_host); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain); + migration_test_add("/migration/precopy/tcp/plain", test_precopy_tcp_plain); - qtest_add_func("/migration/precopy/tcp/plain/switchover-ack", - test_precopy_tcp_switchover_ack); + migration_test_add("/migration/precopy/tcp/plain/switchover-ack", + test_precopy_tcp_switchover_ack); #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/tcp/tls/psk/match", - test_precopy_tcp_tls_psk_match); - qtest_add_func("/migration/precopy/tcp/tls/psk/mismatch", - test_precopy_tcp_tls_psk_mismatch); + migration_test_add("/migration/precopy/tcp/tls/psk/match", + test_precopy_tcp_tls_psk_match); + migration_test_add("/migration/precopy/tcp/tls/psk/mismatch", + test_precopy_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/tcp/tls/x509/default-host", - test_precopy_tcp_tls_x509_default_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/override-host", - test_precopy_tcp_tls_x509_override_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/mismatch-host", - test_precopy_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/friendly-client", - test_precopy_tcp_tls_x509_friendly_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/hostile-client", - test_precopy_tcp_tls_x509_hostile_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/allow-anon-client", - test_precopy_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/reject-anon-client", - test_precopy_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/precopy/tcp/tls/x509/default-host", + test_precopy_tcp_tls_x509_default_host); + migration_test_add("/migration/precopy/tcp/tls/x509/override-host", + test_precopy_tcp_tls_x509_override_host); + migration_test_add("/migration/precopy/tcp/tls/x509/mismatch-host", + test_precopy_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/precopy/tcp/tls/x509/friendly-client", + test_precopy_tcp_tls_x509_friendly_client); + migration_test_add("/migration/precopy/tcp/tls/x509/hostile-client", + test_precopy_tcp_tls_x509_hostile_client); + migration_test_add("/migration/precopy/tcp/tls/x509/allow-anon-client", + test_precopy_tcp_tls_x509_allow_anon_client); + 
migration_test_add("/migration/precopy/tcp/tls/x509/reject-anon-client", + test_precopy_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - /* qtest_add_func("/migration/ignore_shared", test_ignore_shared); */ + /* migration_test_add("/migration/ignore_shared", test_ignore_shared); */ #ifndef _WIN32 - qtest_add_func("/migration/fd_proto", test_migrate_fd_proto); + migration_test_add("/migration/fd_proto", test_migrate_fd_proto); #endif - qtest_add_func("/migration/validate_uuid", test_validate_uuid); - qtest_add_func("/migration/validate_uuid_error", test_validate_uuid_error); - qtest_add_func("/migration/validate_uuid_src_not_set", - test_validate_uuid_src_not_set); - qtest_add_func("/migration/validate_uuid_dst_not_set", - test_validate_uuid_dst_not_set); + migration_test_add("/migration/validate_uuid", test_validate_uuid); + migration_test_add("/migration/validate_uuid_error", + test_validate_uuid_error); + migration_test_add("/migration/validate_uuid_src_not_set", + test_validate_uuid_src_not_set); + migration_test_add("/migration/validate_uuid_dst_not_set", + test_validate_uuid_dst_not_set); /* * See explanation why this test is slow on function definition */ if (g_test_slow()) { - qtest_add_func("/migration/auto_converge", test_migrate_auto_converge); + migration_test_add("/migration/auto_converge", + test_migrate_auto_converge); if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_limit", test_migrate_dirty_limit); + migration_test_add("/migration/dirty_limit", + test_migrate_dirty_limit); } } - qtest_add_func("/migration/multifd/tcp/plain/none", - test_multifd_tcp_none); - /* - * This test is flaky and sometimes fails in CI and otherwise: - * don't run unless user opts in via environment variable. 
- */ - if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/multifd/tcp/plain/cancel", + migration_test_add("/migration/multifd/tcp/plain/none", + test_multifd_tcp_none); + migration_test_add("/migration/multifd/tcp/plain/cancel", test_multifd_tcp_cancel); - } - qtest_add_func("/migration/multifd/tcp/plain/zlib", - test_multifd_tcp_zlib); + migration_test_add("/migration/multifd/tcp/plain/zlib", + test_multifd_tcp_zlib); #ifdef CONFIG_ZSTD - qtest_add_func("/migration/multifd/tcp/plain/zstd", - test_multifd_tcp_zstd); + migration_test_add("/migration/multifd/tcp/plain/zstd", + test_multifd_tcp_zstd); +#endif +#ifdef CONFIG_QATZIP + migration_test_add("/migration/multifd/tcp/plain/qatzip", + test_multifd_tcp_qatzip); +#endif +#ifdef CONFIG_QPL + migration_test_add("/migration/multifd/tcp/plain/qpl", + test_multifd_tcp_qpl); +#endif +#ifdef CONFIG_UADK + migration_test_add("/migration/multifd/tcp/plain/uadk", + test_multifd_tcp_uadk); #endif #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/multifd/tcp/tls/psk/match", - test_multifd_tcp_tls_psk_match); - qtest_add_func("/migration/multifd/tcp/tls/psk/mismatch", - test_multifd_tcp_tls_psk_mismatch); + migration_test_add("/migration/multifd/tcp/tls/psk/match", + test_multifd_tcp_tls_psk_match); + migration_test_add("/migration/multifd/tcp/tls/psk/mismatch", + test_multifd_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/multifd/tcp/tls/x509/default-host", - test_multifd_tcp_tls_x509_default_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/override-host", - test_multifd_tcp_tls_x509_override_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/mismatch-host", - test_multifd_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/allow-anon-client", - test_multifd_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/multifd/tcp/tls/x509/reject-anon-client", - test_multifd_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/default-host", + test_multifd_tcp_tls_x509_default_host); + migration_test_add("/migration/multifd/tcp/tls/x509/override-host", + test_multifd_tcp_tls_x509_override_host); + migration_test_add("/migration/multifd/tcp/tls/x509/mismatch-host", + test_multifd_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/multifd/tcp/tls/x509/allow-anon-client", + test_multifd_tcp_tls_x509_allow_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/reject-anon-client", + test_multifd_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_ring", - test_precopy_unix_dirty_ring); - qtest_add_func("/migration/vcpu_dirty_limit", - test_vcpu_dirty_limit); + migration_test_add("/migration/dirty_ring", + test_precopy_unix_dirty_ring); + migration_test_add("/migration/vcpu_dirty_limit", + test_vcpu_dirty_limit); } ret = g_test_run(); diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c index 2c15f6095848fea7789bf62f41ee6b9128f67fde..df1f93ea6aade3d23ee294ea3795979151dd8f93 100644 --- a/tests/qtest/qmp-cmd-test.c +++ b/tests/qtest/qmp-cmd-test.c @@ -110,6 +110,7 @@ static bool query_is_ignored(const char *cmd) "query-sev-capabilities", "query-sgx", "query-sgx-capabilities", + "query-virtcca-capabilities", /* Success depends on enabling dirty page rate limit */ "query-vcpu-dirty-limit", NULL diff --git a/tests/qtest/tpm-tests.c b/tests/qtest/tpm-tests.c 
index fb94496bbd8614219ae8416e97c37740ad5e09ed..197714f8d99aeffebb7e32465e04489c267c7372 100644 --- a/tests/qtest/tpm-tests.c +++ b/tests/qtest/tpm-tests.c @@ -114,7 +114,7 @@ void tpm_test_swtpm_migration_test(const char *src_tpm_path, sizeof(tpm_pcrread_resp)); tpm_util_migrate(src_qemu, uri); - tpm_util_wait_for_migration_complete(src_qemu); + tpm_util_wait_for_migration_complete(dst_qemu); tpm_util_pcrread(dst_qemu, tx, tpm_pcrread_resp, sizeof(tpm_pcrread_resp)); diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index d4e437265f6630d376540e19051d22f2bc568151..fadf3f0f2e9d064962068c237c0ba20fc8a7cee7 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -799,6 +799,23 @@ static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc) read_guest_mem_server(global_qtest, server); } +static void wait_for_rings_started(TestServer *s, size_t count) +{ + gint64 end_time; + + g_mutex_lock(&s->data_mutex); + end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND; + while (ctpop64(s->rings) != count) { + if (!g_cond_wait_until(&s->data_cond, &s->data_mutex, end_time)) { + /* timeout has passed */ + g_assert_cmpint(ctpop64(s->rings), ==, count); + break; + } + } + + g_mutex_unlock(&s->data_mutex); +} + static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) { TestServer *s = arg; @@ -869,6 +886,7 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) qtest_qmp_eventwait(to, "RESUME"); g_assert(wait_for_fds(dest)); + wait_for_rings_started(dest, 2); read_guest_mem_server(to, dest); g_source_destroy(source); @@ -880,23 +898,6 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) g_string_free(dest_cmdline, true); } -static void wait_for_rings_started(TestServer *s, size_t count) -{ - gint64 end_time; - - g_mutex_lock(&s->data_mutex); - end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND; - while (ctpop64(s->rings) != count) { - if (!g_cond_wait_until(&s->data_cond, &s->data_mutex, end_time)) { - /* timeout has passed */ - g_assert_cmpint(ctpop64(s->rings), ==, count); - break; - } - } - - g_mutex_unlock(&s->data_mutex); -} - static inline void test_server_connect(TestServer *server) { test_server_create_chr(server, ",reconnect=1"); diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target index cded1d01fcdc91e6cd4028b1951cb3c249874c75..6d593c63928853ebdf1ac374df1bf786c9a74af5 100644 --- a/tests/tcg/aarch64/Makefile.target +++ b/tests/tcg/aarch64/Makefile.target @@ -40,8 +40,9 @@ endif # Pauth Tests ifneq ($(CROSS_CC_HAS_ARMV8_3),) -AARCH64_TESTS += pauth-1 pauth-2 pauth-4 pauth-5 +AARCH64_TESTS += pauth-1 pauth-2 pauth-4 pauth-5 test-2375 pauth-%: CFLAGS += -march=armv8.3-a +test-2375: CFLAGS += -march=armv8.3-a run-pauth-1: QEMU_OPTS += -cpu max run-pauth-2: QEMU_OPTS += -cpu max # Choose a cpu with FEAT_Pauth but without FEAT_FPAC for pauth-[45]. 
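The Makefile hunk above builds test-2375.c (added next) with -march=armv8.3-a because FJCVTZS arrived with ARMv8.3, alongside pointer authentication. FJCVTZS performs JavaScript's ToInt32 conversion (truncate toward zero, wrap modulo 2^32) and sets the Z/EQ flag only when the conversion is exact; issue 2375 concerned a denormal input while FPCR.FZ is set. A standalone C model of just the value and exactness computation (editor's sketch; flag plumbing and FZ flushing are deliberately not modelled):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static int32_t js_to_int32(double d, int *exact)
{
    double t = trunc(d);              /* round toward zero */
    double m = fmod(t, 4294967296.0); /* wrap modulo 2^32 */

    if (m < 0) {
        m += 4294967296.0;
    }
    /* The uint32 -> int32 cast wraps on common ABIs; fine for a model. */
    int32_t r = (int32_t)(uint32_t)m;

    *exact = ((double)r == d);        /* what the Z/EQ flag reports */
    return r;
}

int main(void)
{
    int exact;

    printf("%d (exact=%d)\n", js_to_int32(4294967301.0, &exact), exact);
    /* prints: 5 (exact=0), the out-of-range input wrapped and was inexact */
    return 0;
}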
diff --git a/tests/tcg/aarch64/test-2375.c b/tests/tcg/aarch64/test-2375.c new file mode 100644 index 0000000000000000000000000000000000000000..84c7e7de7163fa5615c8912c18b4321b95f98078 --- /dev/null +++ b/tests/tcg/aarch64/test-2375.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2024 Linaro Ltd */ +/* See https://gitlab.com/qemu-project/qemu/-/issues/2375 */ + +#include <assert.h> + +int main(void) +{ + int r, z; + + asm("msr fpcr, %2\n\t" + "fjcvtzs %w0, %d3\n\t" + "cset %1, eq" + : "=r"(r), "=r"(z) + : "r"(0x01000000L), /* FZ = 1 */ + "w"(0xfcff00L)); /* denormal */ + + assert(r == 0); + assert(z == 0); + return 0; +} diff --git a/tests/unit/meson.build b/tests/unit/meson.build index a05d471090401cab996c76e2d229699e99096e9b..598ba41bb93096dc674fb3193d4877c22046046e 100644 --- a/tests/unit/meson.build +++ b/tests/unit/meson.build @@ -142,9 +142,6 @@ if have_system 'test-vmstate': [migration, io], 'test-yank': ['socket-helpers.c', qom, io, chardev] } - if config_host_data.get('CONFIG_INOTIFY1') - tests += {'test-util-filemonitor': []} - endif # Some tests: test-char, test-qdev-global-props, and test-qga, # are not runnable under TSan due to a known issue. diff --git a/tests/unit/ptimer-test.c b/tests/unit/ptimer-test.c index 04b5f4e3d03148010748956bf894c4db3cb55b0f..08240594bbd65bc501629e6e06b7773a0c453e39 100644 --- a/tests/unit/ptimer-test.c +++ b/tests/unit/ptimer-test.c @@ -763,6 +763,33 @@ static void check_oneshot_with_load_0(gconstpointer arg) ptimer_free(ptimer); } +static void check_freq_more_than_1000M(gconstpointer arg) +{ + const uint8_t *policy = arg; + ptimer_state *ptimer = ptimer_init(ptimer_trigger, NULL, *policy); + bool no_round_down = (*policy & PTIMER_POLICY_NO_COUNTER_ROUND_DOWN); + + triggered = false; + + ptimer_transaction_begin(ptimer); + ptimer_set_freq(ptimer, 2000000000); + ptimer_set_limit(ptimer, 8, 1); + ptimer_run(ptimer, 1); + ptimer_transaction_commit(ptimer); + + qemu_clock_step(3); + + g_assert_cmpuint(ptimer_get_count(ptimer), ==, no_round_down ?
3 : 2); + g_assert_false(triggered); + + qemu_clock_step(1); + + g_assert_cmpuint(ptimer_get_count(ptimer), ==, 0); + g_assert_true(triggered); + + ptimer_free(ptimer); +} + static void add_ptimer_tests(uint8_t policy) { char policy_name[256] = ""; @@ -857,6 +884,12 @@ static void add_ptimer_tests(uint8_t policy) policy_name), g_memdup2(&policy, 1), check_oneshot_with_load_0, g_free); g_free(tmp); + + g_test_add_data_func_full( + tmp = g_strdup_printf("/ptimer/freq_more_than_1000M policy=%s", + policy_name), + g_memdup2(&policy, 1), check_freq_more_than_1000M, g_free); + g_free(tmp); } static void add_all_ptimer_policies_comb_tests(void) diff --git a/tests/unit/test-char.c b/tests/unit/test-char.c index 649fdf64e1964998f88dec9089e673172e88e476..0cb26331900a642912155575f9763ca317ea259e 100644 --- a/tests/unit/test-char.c +++ b/tests/unit/test-char.c @@ -574,7 +574,7 @@ static void char_udp_test_internal(Chardev *reuse_chr, int sock) struct sockaddr_in other; SocketIdleData d = { 0, }; Chardev *chr; - CharBackend *be; + CharBackend stack_be, *be = &stack_be; socklen_t alen = sizeof(other); int ret; char buf[10]; @@ -590,7 +590,6 @@ static void char_udp_test_internal(Chardev *reuse_chr, int sock) chr = qemu_chr_new("client", tmp, NULL); g_assert_nonnull(chr); - be = g_alloca(sizeof(CharBackend)); qemu_chr_fe_init(be, chr, &error_abort); } diff --git a/tests/unit/test-crypto-cipher.c b/tests/unit/test-crypto-cipher.c index d9d9d078ff11c9942922d5228c7d3448a0320731..11ab1a54fca3991fe68c8f7ce180cd436e9194c0 100644 --- a/tests/unit/test-crypto-cipher.c +++ b/tests/unit/test-crypto-cipher.c @@ -382,6 +382,19 @@ static QCryptoCipherTestData test_data[] = { .plaintext = "90afe91bb288544f2c32dc239b2635e6", .ciphertext = "6cb4561c40bf0a9705931cb6d408e7fa", }, +#ifdef CONFIG_CRYPTO_SM4 + { + /* SM4, GB/T 32907-2016, Appendix A.1 */ + .path = "/crypto/cipher/sm4", + .alg = QCRYPTO_CIPHER_ALG_SM4, + .mode = QCRYPTO_CIPHER_MODE_ECB, + .key = "0123456789abcdeffedcba9876543210", + .plaintext = + "0123456789abcdeffedcba9876543210", + .ciphertext = + "681edf34d206965e86b3e94f536e4246", + }, +#endif { /* #1 32 byte key, 32 byte PTX */ .path = "/crypto/cipher/aes-xts-128-1", diff --git a/tests/unit/test-crypto-hash.c b/tests/unit/test-crypto-hash.c index 1f4abb822b6140bc49200921e686078c90fa11a6..61908e1769eaf56320e716a49e7a8e5070b80d56 100644 --- a/tests/unit/test-crypto-hash.c +++ b/tests/unit/test-crypto-hash.c @@ -42,6 +42,9 @@ "63b54e4cb2d2032b393994aa263c0dbb" \ "e00a9f2fe9ef6037352232a1eec55ee7" #define OUTPUT_RIPEMD160 "f3d658fad3fdfb2b52c9369cf0d441249ddfa8a0" +#ifdef CONFIG_CRYPTO_SM3 +#define OUTPUT_SM3 "d4a97db105b477b84c4f20ec9c31a6c814e2705a0b83a5a89748d75f0ef456a1" +#endif #define OUTPUT_MD5_B64 "Yo0gY3FWMDWrjvYvSSveyQ==" #define OUTPUT_SHA1_B64 "sudPJnWKOkIeUJzuBFJEt4dTzAI=" @@ -54,6 +57,10 @@ "7sVe5w==" #define OUTPUT_RIPEMD160_B64 "89ZY+tP9+ytSyTac8NRBJJ3fqKA=" +#ifdef CONFIG_CRYPTO_SM3 +#define OUTPUT_SM3_B64 "1Kl9sQW0d7hMTyDsnDGmyBTicFoLg6Wol0jXXw70VqE=" +#endif + static const char *expected_outputs[] = { [QCRYPTO_HASH_ALG_MD5] = OUTPUT_MD5, [QCRYPTO_HASH_ALG_SHA1] = OUTPUT_SHA1, @@ -62,6 +69,9 @@ static const char *expected_outputs[] = { [QCRYPTO_HASH_ALG_SHA384] = OUTPUT_SHA384, [QCRYPTO_HASH_ALG_SHA512] = OUTPUT_SHA512, [QCRYPTO_HASH_ALG_RIPEMD160] = OUTPUT_RIPEMD160, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = OUTPUT_SM3, +#endif }; static const char *expected_outputs_b64[] = { [QCRYPTO_HASH_ALG_MD5] = OUTPUT_MD5_B64, @@ -71,6 +81,9 @@ static const char *expected_outputs_b64[] = { 
[QCRYPTO_HASH_ALG_SHA384] = OUTPUT_SHA384_B64, [QCRYPTO_HASH_ALG_SHA512] = OUTPUT_SHA512_B64, [QCRYPTO_HASH_ALG_RIPEMD160] = OUTPUT_RIPEMD160_B64, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = OUTPUT_SM3_B64, +#endif }; static const int expected_lens[] = { [QCRYPTO_HASH_ALG_MD5] = 16, @@ -80,6 +93,9 @@ static const int expected_lens[] = { [QCRYPTO_HASH_ALG_SHA384] = 48, [QCRYPTO_HASH_ALG_SHA512] = 64, [QCRYPTO_HASH_ALG_RIPEMD160] = 20, +#ifdef CONFIG_CRYPTO_SM3 + [QCRYPTO_HASH_ALG_SM3] = 32, +#endif }; static const char hex[] = "0123456789abcdef"; diff --git a/tests/unit/test-crypto-hmac.c b/tests/unit/test-crypto-hmac.c index 23eb724d94250f5c8a37c21fb4fd9f1dd874fca9..b1d04e9fcccafbde5bf8fbc114c9d67be9dfd4a1 100644 --- a/tests/unit/test-crypto-hmac.c +++ b/tests/unit/test-crypto-hmac.c @@ -76,6 +76,14 @@ static QCryptoHmacTestData test_data[] = { "94964ed4c1155b62b668c241d67279e5" "8a711676", }, +#ifdef CONFIG_CRYPTO_SM3 + { + .alg = QCRYPTO_HASH_ALG_SM3, + .hex_digest = + "760e3799332bc913819b930085360ddb" + "c05529261313d5b15b75bab4fd7ae91e", + }, +#endif }; static const char hex[] = "0123456789abcdef"; diff --git a/tests/unit/test-crypto-pbkdf.c b/tests/unit/test-crypto-pbkdf.c index 43c417f6b43389e848d3828174b0f2492c771825..3d76593c86c5e155b581a82c7a2d9a49c926f5e6 100644 --- a/tests/unit/test-crypto-pbkdf.c +++ b/tests/unit/test-crypto-pbkdf.c @@ -326,6 +326,22 @@ static QCryptoPbkdfTestData test_data[] = { "\xce\xbf\x91\x14\x8b\x5c\x48\x41", .nout = 32 }, +#ifdef CONFIG_CRYPTO_SM3 + { + .path = "/crypto/pbkdf/nonrfc/sm3/iter2", + .hash = QCRYPTO_HASH_ALG_SM3, + .iterations = 2, + .key = "password", + .nkey = 8, + .salt = "ATHENA.MIT.EDUraeburn", + .nsalt = 21, + .out = "\x48\x71\x1b\x58\xa3\xcb\xce\x06" + "\xba\xad\x77\xa8\xb5\xb9\xd8\x07" + "\x6a\xe2\xb3\x5b\x95\xce\xc8\xce" + "\xe7\xb1\xcb\xee\x61\xdf\x04\xea", + .nout = 32 + }, +#endif #if 0 { .path = "/crypto/pbkdf/nonrfc/whirlpool/iter1200", diff --git a/tests/unit/test-vmstate.c b/tests/unit/test-vmstate.c index 0b7d5ecd6838da416318210b39f83fbf9657277d..22c586eee02a2ad84fc1986463ca00bb9080d2ee 100644 --- a/tests/unit/test-vmstate.c +++ b/tests/unit/test-vmstate.c @@ -31,6 +31,7 @@ #include "../migration/savevm.h" #include "qemu/module.h" #include "io/channel-file.h" +#include "exec/memory.h" static int temp_fd; @@ -1479,6 +1480,11 @@ static void test_tmp_struct(void) g_assert_cmpint(obj.f, ==, 8); /* From the child->parent */ } +/* stub for ut */ +void memory_region_commit(void) +{ +} + int main(int argc, char **argv) { g_autofree char *temp_file = g_strdup_printf("%s/vmst.test.XXXXXX", diff --git a/ui/clipboard.c b/ui/clipboard.c index 3d14bffaf80fd446fd38bdb812075366fe02179c..b3f6fa3c9e1fa83889372e330ef4422e6c7fc587 100644 --- a/ui/clipboard.c +++ b/ui/clipboard.c @@ -163,9 +163,15 @@ void qemu_clipboard_set_data(QemuClipboardPeer *peer, } g_free(info->types[type].data); - info->types[type].data = g_memdup(data, size); - info->types[type].size = size; - info->types[type].available = true; + if (size) { + info->types[type].data = g_memdup2(data, size); + info->types[type].size = size; + info->types[type].available = true; + } else { + info->types[type].data = NULL; + info->types[type].size = 0; + info->types[type].available = false; + } if (update) { qemu_clipboard_update(info); diff --git a/ui/console-vc.c b/ui/console-vc.c index 9c13cc2981b02317b224be52166cfcb16418c4bf..b1903c3e48001a748a84ca752663aa311b1bf9f5 100644 --- a/ui/console-vc.c +++ b/ui/console-vc.c @@ -648,7 +648,7 @@ static void vc_putchar(VCChardev *vc, 
int ch) { QemuTextConsole *s = vc->console; int i; int x, y; - char response[40]; + g_autofree char *response = NULL; switch(vc->state) { case TTY_STATE_NORM: @@ -821,7 +821,7 @@ static void vc_putchar(VCChardev *vc, int ch) break; case 6: /* report cursor position */ - sprintf(response, "\033[%d;%dR", + response = g_strdup_printf("\033[%d;%dR", (s->y_base + s->y) % s->total_height + 1, s->x + 1); vc_respond_str(vc, response); diff --git a/ui/curses.c b/ui/curses.c index 8bde8c5cf7c35ca85295cd1a46fef27df00ce1a5..26438486fc97a2fc1b2e872a41fe0a9d73c1ac7e 100644 --- a/ui/curses.c +++ b/ui/curses.c @@ -38,7 +38,7 @@ #include "ui/input.h" #include "sysemu/sysemu.h" -#if defined(__APPLE__) || defined(__OpenBSD__) +#ifdef __APPLE__ #define _XOPEN_SOURCE_EXTENDED 1 #endif diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c index 3af5ac5bcf336244efbdb42ef08d5754364d0927..75f6b9011a5caf049dee4cafe15f0b1d3f8c6adf 100644 --- a/ui/gtk-egl.c +++ b/ui/gtk-egl.c @@ -150,6 +150,7 @@ void gd_egl_refresh(DisplayChangeListener *dcl) vc, vc->window ? vc->window : vc->gfx.drawing_area); if (vc->gfx.guest_fb.dmabuf && vc->gfx.guest_fb.dmabuf->draw_submitted) { + gd_egl_draw(vc); return; } diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c index 52dcac161e25759e06460109430f4a8ce993895e..4fff957c3f08028337a7db4e43531fad9eefe10c 100644 --- a/ui/gtk-gl-area.c +++ b/ui/gtk-gl-area.c @@ -126,6 +126,7 @@ void gd_gl_area_refresh(DisplayChangeListener *dcl) gd_update_monitor_refresh_rate(vc, vc->window ? vc->window : vc->gfx.drawing_area); if (vc->gfx.guest_fb.dmabuf && vc->gfx.guest_fb.dmabuf->draw_submitted) { + gd_gl_area_draw(vc); return; } diff --git a/ui/gtk.c b/ui/gtk.c index 810d7fc7960735e03bde904a62147743b935ff28..1a69f6fc3784876273a47ce096d38ed4cc7cf9d9 100644 --- a/ui/gtk.c +++ b/ui/gtk.c @@ -887,7 +887,7 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion, int x, y; int mx, my; int fbh, fbw; - int ww, wh, ws; + int ww, wh; if (!vc->gfx.ds) { return TRUE; @@ -898,8 +898,13 @@ static gboolean gd_motion_event(GtkWidget *widget, GdkEventMotion *motion, ww = gtk_widget_get_allocated_width(widget); wh = gtk_widget_get_allocated_height(widget); - ws = gtk_widget_get_scale_factor(widget); + /* + * `widget` may not have the same size as the frame buffer. + * In such cases, some padding is needed around the `vc`. + * To achieve that, `vc` is displayed at (mx, my) + * so that it sits at the center of the widget. + */ mx = my = 0; if (ww > fbw) { mx = (ww - fbw) / 2; @@ -908,8 +913,12 @@ my = (wh - fbh) / 2; } - x = (motion->x - mx) / vc->gfx.scale_x * ws; - y = (motion->y - my) / vc->gfx.scale_y * ws; + /* + * `motion` is reported in `widget` coordinates, + * so translate it to the coordinates in `vc`.
+ */ + x = (motion->x - mx) / vc->gfx.scale_x; + y = (motion->y - my) / vc->gfx.scale_y; if (qemu_input_is_absolute(vc->gfx.dcl.con)) { if (x < 0 || y < 0 || diff --git a/ui/qemu-pixman.c b/ui/qemu-pixman.c index 5ca55dd19984969ec113fb9d8b105f33e7f4632a..6cada8b45e17bf6fd7fdf5d52b5a88f4be934358 100644 --- a/ui/qemu-pixman.c +++ b/ui/qemu-pixman.c @@ -49,7 +49,6 @@ PixelFormat qemu_pixelformat_from_pixman(pixman_format_code_t format) break; default: g_assert_not_reached(); - break; } pf.amax = (1 << pf.abits) - 1; diff --git a/ui/sdl2.c b/ui/sdl2.c index 4971963f0082b98a327117cd9180640d80600462..cc44d2708b698024d9a3e599c216449e38365981 100644 --- a/ui/sdl2.c +++ b/ui/sdl2.c @@ -115,6 +115,7 @@ void sdl2_window_create(struct sdl2_console *scon) SDL_SetHint(SDL_HINT_RENDER_BATCHING, "1"); scon->winctx = SDL_GL_CreateContext(scon->real_window); + SDL_GL_SetSwapInterval(0); } else { /* The SDL renderer is only used by sdl2-2D, when OpenGL is disabled */ scon->real_renderer = SDL_CreateRenderer(scon->real_window, -1, 0); diff --git a/ui/vnc-auth-sasl.c b/ui/vnc-auth-sasl.c index 47fdae5b215b93b7ea3c9c52ce7bdeee051e9dfc..e321c9decce52d9f070e12b67dc013aeda089f5f 100644 --- a/ui/vnc-auth-sasl.c +++ b/ui/vnc-auth-sasl.c @@ -674,6 +674,13 @@ void start_auth_sasl(VncState *vs) } trace_vnc_auth_sasl_mech_list(vs, mechlist); + if (g_str_equal(mechlist, "")) { + trace_vnc_auth_fail(vs, vs->auth, "no available SASL mechanisms", ""); + sasl_dispose(&vs->sasl.conn); + vs->sasl.conn = NULL; + goto authabort; + } + vs->sasl.mechlist = g_strdup(mechlist); mechlistlen = strlen(mechlist); vnc_write_u32(vs, mechlistlen); diff --git a/ui/vnc.c b/ui/vnc.c index 4f23a0fa79f5d7f979dad0d54b485d1d37c97692..5dd77e73cbb4b1962fcd19d20e01300713616eff 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -1365,6 +1365,8 @@ void vnc_disconnect_finish(VncState *vs) g_free(vs->zrle); g_free(vs->tight); g_free(vs); + + qemu_timer_set_mode(QEMU_TIMER_USB_LAZY_MODE, QEMU_USB_CONTROLLER_UHCI); } size_t vnc_client_io_error(VncState *vs, ssize_t ret, Error *err) @@ -3341,6 +3343,8 @@ static void vnc_connect(VncDisplay *vd, QIOChannelSocket *sioc, } } } + + qemu_timer_set_mode(QEMU_TIMER_USB_NORMAL_MODE, QEMU_USB_CONTROLLER_UHCI); } void vnc_start_protocol(VncState *vs) diff --git a/util/chardev_open.c b/util/chardev_open.c new file mode 100644 index 0000000000000000000000000000000000000000..f7764297882f59411c81b304a1ab5d16e903a389 --- /dev/null +++ b/util/chardev_open.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * Copyright (C) 2023 Intel Corporation. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Yi Liu + * + * Copied from + * https://github.com/linux-rdma/rdma-core/blob/master/util/open_cdev.c + * + */ + +#include "qemu/osdep.h" +#include "qemu/chardev_open.h" + +static int open_cdev_internal(const char *path, dev_t cdev) +{ + struct stat st; + int fd; + + fd = qemu_open_old(path, O_RDWR); + if (fd == -1) { + return -1; + } + if (fstat(fd, &st) || !S_ISCHR(st.st_mode) || + (cdev != 0 && st.st_rdev != cdev)) { + close(fd); + return -1; + } + return fd; +} + +static int open_cdev_robust(dev_t cdev) +{ + g_autofree char *devpath = NULL; + + /* + * This assumes that udev is being used and is creating the /dev/char/ + * symlinks. + */ + devpath = g_strdup_printf("/dev/char/%u:%u", major(cdev), minor(cdev)); + return open_cdev_internal(devpath, cdev); +} + +int open_cdev(const char *devpath, dev_t cdev) +{ + int fd; + + fd = open_cdev_internal(devpath, cdev); + if (fd == -1 && cdev != 0) { + return open_cdev_robust(cdev); + } + return fd; +} diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c index 7b304c79d94204e78022211b6be49257ebc1fab8..650c21846dd19567dc2ec8fa6f2f2cf10db1e2fe 100644 --- a/util/coroutine-ucontext.c +++ b/util/coroutine-ucontext.c @@ -80,6 +80,19 @@ union cc_arg { int i[2]; }; +/** + * coroutines list for libcare + */ +struct CoroutineInformation { + sigjmp_buf *env; + QLIST_ENTRY(CoroutineInformation) next; +}; + +static QemuMutex coro_mtx; +QLIST_HEAD(, CoroutineInformation) coro_info_list = QLIST_HEAD_INITIALIZER(pool); +int coro_env_offset = offsetof(struct CoroutineInformation, env); +int coro_next_offset = offsetof(struct CoroutineInformation, next); + /* * QEMU_ALWAYS_INLINE only does so if __OPTIMIZE__, so we cannot use it. * always_inline is required to avoid TSan runtime fatal errors. 
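Editor's note: the CoroutineInformation list above is bookkeeping for an out-of-process live-patching tool (libcare), which walks it via the exported coro_env_offset/coro_next_offset to locate every saved sigjmp_buf. In-process, the traversal is an ordinary QLIST walk; a hypothetical debug helper, if it were added to coroutine-ucontext.c alongside the functions below, might look like:

/* Count coroutines currently tracked for live patching (editor's sketch). */
static size_t qemu_coroutine_info_count(void)
{
    struct CoroutineInformation *coro_info;
    size_t n = 0;

    qemu_mutex_lock(&coro_mtx);
    QLIST_FOREACH(coro_info, &coro_info_list, next) {
        n++;
    }
    qemu_mutex_unlock(&coro_mtx);

    return n;
}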
@@ -340,3 +353,42 @@ bool qemu_in_coroutine(void) return self && self->caller; } + +static void __attribute__((constructor)) coro_mutex_init(void) +{ + qemu_mutex_init(&coro_mtx); +} + +void qemu_coroutine_info_add(const Coroutine *co_) +{ + CoroutineUContext *co; + struct CoroutineInformation *coro_info; + + /* save coroutine env to coro_info_list */ + co = DO_UPCAST(CoroutineUContext, base, co_); + coro_info = g_malloc0(sizeof(struct CoroutineInformation)); + coro_info->env = &co->env; + + qemu_mutex_lock(&coro_mtx); + QLIST_INSERT_HEAD(&coro_info_list, coro_info, next); + qemu_mutex_unlock(&coro_mtx); +} + +void qemu_coroutine_info_delete(const Coroutine *co_) +{ + CoroutineUContext *co; + struct CoroutineInformation *coro_info; + + /* Remove relative coroutine env info from coro_info_list */ + co = DO_UPCAST(CoroutineUContext, base, co_); + + qemu_mutex_lock(&coro_mtx); + QLIST_FOREACH(coro_info, &coro_info_list, next) { + if (coro_info->env == &co->env) { + QLIST_REMOVE(coro_info, next); + g_free(coro_info); + break; + } + } + qemu_mutex_unlock(&coro_mtx); +} diff --git a/util/log.c b/util/log.c index d36c98da0b4ee251bd79f8ff5eda3250ec3bd967..78b6cf225f8de78a0d0092d78d72c3aeac16138d 100644 --- a/util/log.c +++ b/util/log.c @@ -143,6 +143,12 @@ void qemu_log_unlock(FILE *logfile) } } +#ifdef CONFIG_DISABLE_QEMU_LOG +void qemu_log(const char *fmt, ...) +{ + return; +} +#else void qemu_log(const char *fmt, ...) { FILE *f = qemu_log_trylock(); @@ -155,6 +161,7 @@ void qemu_log(const char *fmt, ...) qemu_log_unlock(f); } } +#endif static void __attribute__((__constructor__)) startup(void) { diff --git a/util/meson.build b/util/meson.build index c2322ef6e71a1de643d44dd3ad4bad497bc975ec..174c133368a8584315f488786a3d4b59f6cede80 100644 --- a/util/meson.build +++ b/util/meson.build @@ -108,6 +108,7 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('chardev_open.c')) endif if cpu == 'aarch64' diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index ed14f9c64de510e1add0e1b1e29844e43a36f012..6890ad676c434cd8047c739d8bb9086a75113a0a 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -30,6 +30,28 @@ #include #endif +size_t qemu_fd_getfiletype(int fd) +{ + struct statfs fs; + int ret; + + if (fd != -1) { + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (ret != 0) { + fprintf(stderr, "Couldn't fstatfs() fd: %s\n", + strerror(errno)); + return -1; + } + return fs.f_type; + } else { + fprintf(stderr, "fd is invalid \n"); + return -1; + } +} + QemuFsType qemu_fd_getfs(int fd) { #ifdef CONFIG_LINUX diff --git a/util/module.c b/util/module.c index 32e263163c75dfa8e93ca67c739d4a501162353c..3eb0f06df165503f81c3cbc6657557666307506c 100644 --- a/util/module.c +++ b/util/module.c @@ -354,13 +354,13 @@ int module_load_qom(const char *type, Error **errp) void module_load_qom_all(void) { const QemuModinfo *modinfo; - Error *local_err = NULL; if (module_loaded_qom_all) { return; } for (modinfo = module_info; modinfo->name != NULL; modinfo++) { + Error *local_err = NULL; if (!modinfo->objs) { continue; } diff --git a/util/oslib-posix.c b/util/oslib-posix.c index e86fd64e099877cec2818d948da659d261fde040..43af077fedd3257b7991b2dcf28864a4d723c8fc 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -88,6 +88,8 @@ static QemuMutex sigbus_mutex; static QemuMutex page_mutex; static QemuCond page_cond; +static int started_num_threads; + int 
qemu_get_thread_id(void) { #if defined(__linux__) @@ -344,6 +346,10 @@ static void *do_touch_pages(void *arg) } qemu_mutex_unlock(&page_mutex); + while (started_num_threads != memset_args->context->num_threads) { + smp_mb(); + } + /* unblock SIGBUS */ sigemptyset(&set); sigaddset(&set, SIGBUS); @@ -448,7 +454,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, context.threads = g_new0(MemsetThread, context.num_threads); numpages_per_thread = numpages / context.num_threads; leftover = numpages % context.num_threads; - for (i = 0; i < context.num_threads; i++) { + for (i = 0, started_num_threads = 0; i < context.num_threads; i++) { context.threads[i].addr = addr; context.threads[i].numpages = numpages_per_thread + (i < leftover); context.threads[i].hpagesize = hpagesize; @@ -464,6 +470,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, QEMU_THREAD_JOINABLE); } addr += context.threads[i].numpages * hpagesize; + started_num_threads++; } if (!use_madv_populate_write) { diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 5fd2dbaf8bb7818f6b57059f90fea5305796612e..f550214484ffff5e1cc5f512abaf1c2aa40926d7 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -89,6 +89,8 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) co = qemu_coroutine_new(); } + qemu_coroutine_info_add(co); + co->entry = entry; co->entry_arg = opaque; QSIMPLEQ_INIT(&co->co_queue_wakeup); @@ -99,6 +101,8 @@ static void coroutine_delete(Coroutine *co) { co->caller = NULL; + qemu_coroutine_info_delete(co); + if (IS_ENABLED(CONFIG_COROUTINE_POOL)) { if (release_pool_size < qatomic_read(&pool_max_size) * 2) { QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); diff --git a/util/qemu-timer.c b/util/qemu-timer.c index 6a0de33dd2b86c339dc9dc5abe6548e44c918cb4..dc891cc557db84d354fc476b8fc002a1d3c4e67b 100644 --- a/util/qemu-timer.c +++ b/util/qemu-timer.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "qemu/main-loop.h" #include "qemu/timer.h" #include "qemu/lockable.h" @@ -75,6 +76,74 @@ struct QEMUTimerList { QemuEvent timers_done_ev; }; +typedef struct qemu_controller_timer_state { + qemu_usb_controller_ptr controller; + int refs; +} controller_timer_state; + +typedef controller_timer_state* controller_timer_state_ptr; + +static controller_timer_state uhci_timer_state = { + .controller = NULL, + .refs = 0, +}; + +static controller_timer_state_ptr + qemu_usb_controller_tab[MAX_USB_CONTROLLER_TYPES] = {NULL, + &uhci_timer_state, + NULL, NULL}; + +int qemu_register_usb_controller(qemu_usb_controller_ptr controller, + unsigned int type) +{ + if (type != QEMU_USB_CONTROLLER_UHCI) { + return 0; + } + + /* A companion EHCI controller creates three UHCI controllers; + * initialize the table entry only once.
+ */ + if (!qemu_usb_controller_tab[type]->controller) { + qemu_log("the usb controller (%d) registered a frame handler\n", type); + qemu_usb_controller_tab[type]->controller = controller; + } + + return 0; +} + +int qemu_timer_set_mode(enum qemu_timer_mode mode, unsigned int type) +{ + if (type != QEMU_USB_CONTROLLER_UHCI) { + qemu_log("the usb controller (%d) has no need to change frame freq\n", type); + return 0; + } + + if (!qemu_usb_controller_tab[type]->controller) { + qemu_log("the usb controller (%d) is not registered yet\n", type); + return 0; + } + + if (mode == QEMU_TIMER_USB_NORMAL_MODE) { + if (qemu_usb_controller_tab[type]->refs++ > 0) { + return 0; + } + qemu_usb_controller_tab[type]->controller-> + qemu_set_freq(QEMU_USB_NORMAL_FREQ); + qemu_log("Set the controller (%d) freq to %d Hz\n", + type, QEMU_USB_NORMAL_FREQ); + } else { + if (--qemu_usb_controller_tab[type]->refs > 0) { + return 0; + } + qemu_usb_controller_tab[type]->controller-> + qemu_set_freq(QEMU_USB_LAZY_FREQ); + qemu_log("Set the controller (type:%d) freq to %d Hz\n", + type, QEMU_USB_LAZY_FREQ); + } + + return 0; +} + /** * qemu_clock_ptr: * @type: type of clock diff --git a/util/range.c b/util/range.c index f3f40098d547b99af662c0ec89f5eb312d296167..2ea640662b193b9c08699e21a36c06b37f1b535b 100644 --- a/util/range.c +++ b/util/range.c @@ -61,6 +61,7 @@ GList *range_list_insert(GList *list, Range *data) range_extend(l->data, l->next->data); g_free(l->next->data); new_l = g_list_delete_link(list, l->next); + l->next = NULL; assert(new_l == list); } diff --git a/util/userfaultfd.c b/util/userfaultfd.c index fdff4867e8bd8c0adaac44a4aec468c198214897..b7d320d0b1318842c6f75184e4851fa087f69588 100644 --- a/util/userfaultfd.c +++ b/util/userfaultfd.c @@ -356,31 +356,3 @@ int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) return (int) (res / sizeof(struct uffd_msg)); } - -/** - * uffd_poll_events: poll UFFD file descriptor for read - * - * Returns true if events are available for read, false otherwise - * - * @uffd_fd: UFFD file descriptor - * @tmo: timeout value - */ -bool uffd_poll_events(int uffd_fd, int tmo) -{ - int res; - struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; - - do { - res = poll(&poll_fd, 1, tmo); - } while (res < 0 && errno == EINTR); - - if (res == 0) { - return false; - } - if (res < 0) { - error_report("uffd_poll_events() failed: errno=%i", errno); - return false; - } - - return (poll_fd.revents & POLLIN) != 0; -}
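Editor's note to close out the UHCI lazy-timer additions above (util/qemu-timer.c together with the ui/vnc.c hunks earlier): a device model registers a frequency callback once, and each VNC connect/disconnect then adjusts a refcount that switches the frame rate between QEMU_USB_NORMAL_FREQ and QEMU_USB_LAZY_FREQ. A sketch of the device side under stated assumptions: the qemu_usb_controller struct layout and the callback signature are not visible in this patch, and uhci_set_frame_freq() is hypothetical.

/* Hypothetical UHCI-side hookup for the lazy frame timer (editor's sketch). */
static void uhci_set_frame_freq(int freq)
{
    /* Reprogram the frame timer period, e.g. NANOSECONDS_PER_SECOND / freq. */
}

static struct qemu_usb_controller uhci_frame_ops = {
    .qemu_set_freq = uhci_set_frame_freq,   /* member name as used above */
};

static void uhci_register_frame_ops(void)
{
    /* Register once at realize time... */
    qemu_register_usb_controller(&uhci_frame_ops, QEMU_USB_CONTROLLER_UHCI);

    /* ...then clients toggle the rate: VNC connect raises it, disconnect
     * lowers it again once the last connection is gone (refcounted). */
    qemu_timer_set_mode(QEMU_TIMER_USB_NORMAL_MODE, QEMU_USB_CONTROLLER_UHCI);
    qemu_timer_set_mode(QEMU_TIMER_USB_LAZY_MODE, QEMU_USB_CONTROLLER_UHCI);
}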