From e92f2720038bf5ccb55a56a12fc67066e6137cbd Mon Sep 17 00:00:00 2001 From: Jason Zeng Date: Wed, 22 Feb 2023 13:59:37 +0800 Subject: [PATCH 01/23] Update linux headers to v6.0-rc4 commit d525f73f9186a5bc641b8caf0b2c9bb94e5aa963 upstream. commit 7e18e42e4b280c85b76967a9106a13ca61c16179 Intel-SIG: d525f73f9186 ("Update linux headers to v6.0-rc4"). Backport notify-vm-exit for qemu v6.2.0 Signed-off-by: Chenyi Qiang Reviewed-by: Cornelia Huck Message-Id: <20220915091035.3897-3-chenyi.qiang@intel.com> Signed-off-by: Thomas Huth [ jason: only include missing header changes for notify-vm-exit ] Signed-off-by: Jason Zeng --- linux-headers/asm-x86/kvm.h | 6 +++++- linux-headers/linux/kvm.h | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h index bf6e96011d..8791f7c228 100644 --- a/linux-headers/asm-x86/kvm.h +++ b/linux-headers/asm-x86/kvm.h @@ -325,6 +325,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +360,10 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 02bba131e6..eb069b0f31 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -270,6 +270,7 @@ struct kvm_xen_exit { #define KVM_EXIT_X86_BUS_LOCK 33 #define KVM_EXIT_XEN 34 #define KVM_EXIT_RISCV_SBI 35 +#define KVM_EXIT_NOTIFY 37 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -492,6 +493,11 @@ struct kvm_run { unsigned long args[6]; unsigned long ret[2]; } riscv_sbi; + /* KVM_EXIT_NOTIFY */ + struct { +#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 flags; + } notify; /* Fix the size of the union. */ char padding[256]; }; @@ -1152,6 +1158,8 @@ struct kvm_ppc_resize_hpt { /* #define KVM_CAP_VM_TSC_CONTROL 214 */ #define KVM_CAP_SYSTEM_EVENT_DATA 215 #define KVM_CAP_S390_PROTECTED_DUMP 217 +#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 +#define KVM_CAP_X86_NOTIFY_VMEXIT 219 #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 #define KVM_CAP_SEV_ES_GHCB 500 @@ -2285,4 +2293,8 @@ struct kvm_s390_zpci_op { /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) +/* Available with KVM_CAP_X86_NOTIFY_VMEXIT */ +#define KVM_X86_NOTIFY_VMEXIT_ENABLED (1ULL << 0) +#define KVM_X86_NOTIFY_VMEXIT_USER (1ULL << 1) + #endif /* __LINUX_KVM_H */ -- Gitee From dfbfc08fef801098920789513d0722900c598825 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 29 Sep 2022 15:20:11 +0800 Subject: [PATCH 02/23] i386: kvm: extend kvm_{get, put}_vcpu_events to support pending triple fault commit 12f89a39cf3c5760cba82ce68929d748961f62df upstream. For the direct triple faults, i.e. hardware detected and KVM morphed to VM-Exit, KVM will never lose them. But for triple faults sythesized by KVM, e.g. the RSM path, if KVM exits to userspace before the request is serviced, userspace could migrate the VM and lose the triple fault. A new flag KVM_VCPUEVENT_VALID_TRIPLE_FAULT is defined to signal that the event.triple_fault_pending field contains a valid state if the KVM_CAP_X86_TRIPLE_FAULT_EVENT capability is enabled. Intel-SIG: commit 12f89a39cf3c ("i386: kvm: extend kvm_{get, put}_vcpu_events to support pending triple fault"). Backport notify-vm-exit for qemu v6.2.0 Acked-by: Peter Xu Signed-off-by: Chenyi Qiang Message-Id: <20220929072014.20705-2-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini [ jason: amend commit message ] Signed-off-by: Jason Zeng --- target/i386/cpu.c | 1 + target/i386/cpu.h | 1 + target/i386/kvm/kvm.c | 20 ++++++++++++++++++++ target/i386/machine.c | 20 ++++++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 62149367b1..67b3593285 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6577,6 +6577,7 @@ static void x86_cpu_reset(DeviceState *dev) env->exception_has_payload = false; env->exception_payload = 0; env->nmi_injected = false; + env->triple_fault_pending = false; #if !defined(CONFIG_USER_ONLY) /* We hard-wire the BSP to the first CPU. */ apic_designate_bsp(cpu->apic_state, s->cpu_index == 0); diff --git a/target/i386/cpu.h b/target/i386/cpu.h index e84cb8265c..c415bb3db8 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1736,6 +1736,7 @@ typedef struct CPUX86State { uint8_t has_error_code; uint8_t exception_has_payload; uint64_t exception_payload; + uint8_t triple_fault_pending; uint32_t ins_len; uint32_t sipi_vector; bool tsc_valid; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index a14debcfc2..9541a1aed7 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -129,6 +129,7 @@ static int has_xcrs; static int has_pit_state2; static int has_exception_payload; static int has_map_gpa_range; +static int has_triple_fault_event; static bool has_msr_mcg_ext_ctl; @@ -2472,6 +2473,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } + has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT); + if (has_triple_fault_event) { + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true); + if (ret < 0) { + error_report("kvm: Failed to enable triple fault event cap: %s", + strerror(-ret)); + return ret; + } + } + ret = kvm_get_supported_msrs(s); if (ret < 0) { return ret; @@ -4131,6 +4142,11 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level) } } + if (has_triple_fault_event) { + events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + events.triple_fault.pending = env->triple_fault_pending; + } + return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events); } @@ -4200,6 +4216,10 @@ static int kvm_get_vcpu_events(X86CPU *cpu) } } + if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + env->triple_fault_pending = events.triple_fault.pending; + } + env->sipi_vector = events.sipi_vector; return 0; diff --git a/target/i386/machine.c b/target/i386/machine.c index 8aa54432d7..7fe80e1aa0 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -1518,6 +1518,25 @@ static const VMStateDescription vmstate_msr_ghcb_gpa = { }; #endif +static bool triple_fault_needed(void *opaque) +{ + X86CPU *cpu = opaque; + CPUX86State *env = &cpu->env; + + return env->triple_fault_pending; +} + +static const VMStateDescription vmstate_triple_fault = { + .name = "cpu/triple_fault", + .version_id = 1, + .minimum_version_id = 1, + .needed = triple_fault_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT8(env.triple_fault_pending, X86CPU), + VMSTATE_END_OF_LIST() + } +}; + const VMStateDescription vmstate_x86_cpu = { .name = "cpu", .version_id = 12, @@ -1663,6 +1682,7 @@ const VMStateDescription vmstate_x86_cpu = { #if defined(CONFIG_KVM) && defined(TARGET_X86_64) &vmstate_msr_ghcb_gpa, #endif + &vmstate_triple_fault, NULL } }; -- Gitee From 805bf34df0e8c96f2e48efa5b9f0ebf3b0693883 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Sep 2022 15:20:12 +0800 Subject: [PATCH 03/23] kvm: allow target-specific accelerator properties commit 3dba0a335cf5c53146b606be6ddfab4df81c464e upstream. Several hypervisor capabilities in KVM are target-specific. When exposed to QEMU users as accelerator properties (i.e. -accel kvm,prop=value), they should not be available for all targets. Add a hook for targets to add their own properties to -accel kvm, for now no such property is defined. Intel-SIG: commit 3dba0a335cf5 ("kvm: allow target-specific accelerator properties"). Backport notify-vm-exit for qemu v6.2.0 Signed-off-by: Paolo Bonzini Message-Id: <20220929072014.20705-3-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini [ jason: remove changes in target/riscv/kvm.c since riscv kvm is not supported in qemu-6.2.0, meanwhile add loongarch64 support ] Signed-off-by: Jason Zeng --- accel/kvm/kvm-all.c | 2 ++ include/sysemu/kvm.h | 2 ++ target/arm/kvm.c | 4 ++++ target/i386/kvm/kvm.c | 4 ++++ target/loongarch64/kvm.c | 4 ++++ target/mips/kvm.c | 4 ++++ target/ppc/kvm.c | 4 ++++ target/s390x/kvm/kvm.c | 4 ++++ 8 files changed, 28 insertions(+) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 10af4170d9..573b2ba382 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -3759,6 +3759,8 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data) NULL, NULL); object_class_property_set_description(oc, "dirty-ring-size", "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)"); + + kvm_arch_accel_class_init(oc); } static const TypeInfo kvm_accel_type = { diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index f050dfa1f7..b66970e07b 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -343,6 +343,8 @@ bool kvm_device_supported(int vmfd, uint64_t type); extern const KVMCapabilityInfo kvm_arch_required_capabilities[]; +void kvm_arch_accel_class_init(ObjectClass *oc); + void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run); MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run); diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 1ae4e51055..7bec3e5f4f 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -1059,3 +1059,7 @@ bool kvm_arch_cpu_check_are_resettable(void) { return true; } + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 9541a1aed7..bc7dc65bba 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -5443,3 +5443,7 @@ void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask) mask &= ~BIT_ULL(bit); } } + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} diff --git a/target/loongarch64/kvm.c b/target/loongarch64/kvm.c index 0eaabe3943..8d2f84995e 100644 --- a/target/loongarch64/kvm.c +++ b/target/loongarch64/kvm.c @@ -1417,3 +1417,7 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} diff --git a/target/mips/kvm.c b/target/mips/kvm.c index 086debd9f0..f80ac72dd1 100644 --- a/target/mips/kvm.c +++ b/target/mips/kvm.c @@ -1295,3 +1295,7 @@ bool kvm_arch_cpu_check_are_resettable(void) { return true; } + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index dc93b99189..da2dcea1e2 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -2959,3 +2959,7 @@ bool kvm_arch_cpu_check_are_resettable(void) { return true; } + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c index 6d1a6324b9..dbb911c832 100644 --- a/target/s390x/kvm/kvm.c +++ b/target/s390x/kvm/kvm.c @@ -2677,3 +2677,7 @@ int kvm_s390_get_zpci_op(void) { return cap_zpci_op; } + +void kvm_arch_accel_class_init(ObjectClass *oc) +{ +} -- Gitee From 13530f0db6581d89fe085ef294555ae5454d8dd2 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 29 Sep 2022 15:20:13 +0800 Subject: [PATCH 04/23] kvm: expose struct KVMState commit 5f8a6bce1f1080058ed29d716cae81ea805142ae upstream. Expose struct KVMState out of kvm-all.c so that the field of struct KVMState can be accessed when defining target-specific accelerator properties. Intel-SIG: commit 5f8a6bce1f10 ("kvm: expose struct KVMState"). Backport notify-vm-exit for qemu v6.2.0 Signed-off-by: Chenyi Qiang Message-Id: <20220929072014.20705-4-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini [ jason: amend commit message ] Signed-off-by: Jason Zeng --- accel/kvm/kvm-all.c | 74 -------------------------------------- include/sysemu/kvm_int.h | 76 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 74 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 573b2ba382..e8faa95059 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -73,86 +73,12 @@ do { } while (0) #endif -#define KVM_MSI_HASHTAB_SIZE 256 - struct KVMParkedVcpu { unsigned long vcpu_id; int kvm_fd; QLIST_ENTRY(KVMParkedVcpu) node; }; -enum KVMDirtyRingReaperState { - KVM_DIRTY_RING_REAPER_NONE = 0, - /* The reaper is sleeping */ - KVM_DIRTY_RING_REAPER_WAIT, - /* The reaper is reaping for dirty pages */ - KVM_DIRTY_RING_REAPER_REAPING, -}; - -/* - * KVM reaper instance, responsible for collecting the KVM dirty bits - * via the dirty ring. - */ -struct KVMDirtyRingReaper { - /* The reaper thread */ - QemuThread reaper_thr; - volatile uint64_t reaper_iteration; /* iteration number of reaper thr */ - volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */ -}; - -struct KVMState -{ - AccelState parent_obj; - - int nr_slots; - int fd; - int vmfd; - int coalesced_mmio; - int coalesced_pio; - struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; - bool coalesced_flush_in_progress; - int vcpu_events; - int robust_singlestep; - int debugregs; -#ifdef KVM_CAP_SET_GUEST_DEBUG - QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; -#endif - int max_nested_state_len; - int many_ioeventfds; - int intx_set_mask; - int kvm_shadow_mem; - bool kernel_irqchip_allowed; - bool kernel_irqchip_required; - OnOffAuto kernel_irqchip_split; - bool sync_mmu; - uint64_t manual_dirty_log_protect; - /* The man page (and posix) say ioctl numbers are signed int, but - * they're not. Linux, glibc and *BSD all treat ioctl numbers as - * unsigned, and treating them as signed here can break things */ - unsigned irq_set_ioctl; - unsigned int sigmask_len; - GHashTable *gsimap; -#ifdef KVM_CAP_IRQ_ROUTING - struct kvm_irq_routing *irq_routes; - int nr_allocated_irq_routes; - unsigned long *used_gsi_bitmap; - unsigned int gsi_count; - QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; -#endif - KVMMemoryListener memory_listener; - QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; - - /* For "info mtree -f" to tell if an MR is registered in KVM */ - int nr_as; - struct KVMAs { - KVMMemoryListener *ml; - AddressSpace *as; - } *as; - uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */ - uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */ - struct KVMDirtyRingReaper reaper; -}; - KVMState *kvm_state; bool kvm_kernel_irqchip; bool kvm_split_irqchip; diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index 7e18c0a3c0..60b520a13e 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -10,6 +10,7 @@ #define QEMU_KVM_INT_H #include "exec/memory.h" +#include "qapi/qapi-types-common.h" #include "qemu/accel.h" #include "qemu/queue.h" #include "sysemu/kvm.h" @@ -44,6 +45,81 @@ typedef struct KVMMemoryListener { QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del; } KVMMemoryListener; +#define KVM_MSI_HASHTAB_SIZE 256 + +enum KVMDirtyRingReaperState { + KVM_DIRTY_RING_REAPER_NONE = 0, + /* The reaper is sleeping */ + KVM_DIRTY_RING_REAPER_WAIT, + /* The reaper is reaping for dirty pages */ + KVM_DIRTY_RING_REAPER_REAPING, +}; + +/* + * KVM reaper instance, responsible for collecting the KVM dirty bits + * via the dirty ring. + */ +struct KVMDirtyRingReaper { + /* The reaper thread */ + QemuThread reaper_thr; + volatile uint64_t reaper_iteration; /* iteration number of reaper thr */ + volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */ +}; +struct KVMState +{ + AccelState parent_obj; + + int nr_slots; + int fd; + int vmfd; + int coalesced_mmio; + int coalesced_pio; + struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; + bool coalesced_flush_in_progress; + int vcpu_events; + int robust_singlestep; + int debugregs; +#ifdef KVM_CAP_SET_GUEST_DEBUG + QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; +#endif + int max_nested_state_len; + int many_ioeventfds; + int intx_set_mask; + int kvm_shadow_mem; + bool kernel_irqchip_allowed; + bool kernel_irqchip_required; + OnOffAuto kernel_irqchip_split; + bool sync_mmu; + uint64_t manual_dirty_log_protect; + /* The man page (and posix) say ioctl numbers are signed int, but + * they're not. Linux, glibc and *BSD all treat ioctl numbers as + * unsigned, and treating them as signed here can break things */ + unsigned irq_set_ioctl; + unsigned int sigmask_len; + GHashTable *gsimap; +#ifdef KVM_CAP_IRQ_ROUTING + struct kvm_irq_routing *irq_routes; + int nr_allocated_irq_routes; + unsigned long *used_gsi_bitmap; + unsigned int gsi_count; + QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; +#endif + KVMMemoryListener memory_listener; + QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; + + /* For "info mtree -f" to tell if an MR is registered in KVM */ + int nr_as; + struct KVMAs { + KVMMemoryListener *ml; + AddressSpace *as; + } *as; + uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */ + uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */ + struct KVMDirtyRingReaper reaper; + NotifyVmexitOption notify_vmexit; + uint32_t notify_window; +}; + void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, AddressSpace *as, int as_id, const char *name); -- Gitee From e1401b07e5448076d18d750a4d2b656be0ba5c3c Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 29 Sep 2022 15:20:14 +0800 Subject: [PATCH 05/23] i386: add notify VM exit support commit e2e69f6bb907a70ac518230c54e98e7abcb0c911 upstream. There are cases that malicious virtual machine can cause CPU stuck (due to event windows don't open up), e.g., infinite loop in microcode when nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and IRQ) can be delivered. It leads the CPU to be unavailable to host or other VMs. Notify VM exit is introduced to mitigate such kind of attacks, which will generate a VM exit if no event window occurs in VM non-root mode for a specified amount of time (notify window). A new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT is exposed to user space so that the user can query the capability and set the expected notify window when creating VMs. The format of the argument when enabling this capability is as follows: Bit 63:32 - notify window specified in qemu command Bit 31:0 - some flags (e.g. KVM_X86_NOTIFY_VMEXIT_ENABLED is set to enable the feature.) Users can configure the feature by a new (x86 only) accel property: qemu -accel kvm,notify-vmexit=run|internal-error|disable,notify-window=n The default option of notify-vmexit is run, which will enable the capability and do nothing if the exit happens. The internal-error option raises a KVM internal error if it happens. The disable option does not enable the capability. The default value of notify-window is 0. It is valid only when notify-vmexit is not disabled. The valid range of notify-window is non-negative. It is even safe to set it to zero since there's an internal hardware threshold to be added to ensure no false positive. Because a notify VM exit may happen with VM_CONTEXT_INVALID set in exit qualification (no cases are anticipated that would set this bit), which means VM context is corrupted. It would be reflected in the flags of KVM_EXIT_NOTIFY exit. If KVM_NOTIFY_CONTEXT_INVALID bit is set, raise a KVM internal error unconditionally. Intel-SIG: commit e2e69f6bb907 ("i386: add notify VM exit support"). Backport notify-vm-exit for qemu v6.2.0 Acked-by: Peter Xu Signed-off-by: Chenyi Qiang Message-Id: <20220929072014.20705-5-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini [ jason: amend commit message ] Signed-off-by: Jason Zeng --- accel/kvm/kvm-all.c | 2 + qapi/run-state.json | 17 ++++++++ qemu-options.hx | 11 +++++ target/i386/kvm/kvm.c | 98 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index e8faa95059..328fccd7fa 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -3658,6 +3658,8 @@ static void kvm_accel_instance_init(Object *obj) s->kernel_irqchip_split = ON_OFF_AUTO_AUTO; /* KVM dirty ring is by default off */ s->kvm_dirty_ring_size = 0; + s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN; + s->notify_window = 0; } static void kvm_accel_class_init(ObjectClass *oc, void *data) diff --git a/qapi/run-state.json b/qapi/run-state.json index 43d66d700f..08c38b2c67 100644 --- a/qapi/run-state.json +++ b/qapi/run-state.json @@ -638,3 +638,20 @@ { 'struct': 'MemoryFailureFlags', 'data': { 'action-required': 'bool', 'recursive': 'bool'} } + +## +# @NotifyVmexitOption: +# +# An enumeration of the options specified when enabling notify VM exit +# +# @run: enable the feature, do nothing and continue if the notify VM exit happens. +# +# @internal-error: enable the feature, raise a internal error if the notify +# VM exit happens. +# +# @disable: disable the feature. +# +# Since: 7.2 +## +{ 'enum': 'NotifyVmexitOption', + 'data': [ 'run', 'internal-error', 'disable' ] } \ No newline at end of file diff --git a/qemu-options.hx b/qemu-options.hx index 453d220226..8ec647371c 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -152,6 +152,7 @@ DEF("accel", HAS_ARG, QEMU_OPTION_accel, " split-wx=on|off (enable TCG split w^x mapping)\n" " tb-size=n (TCG translation block cache size)\n" " dirty-ring-size=n (KVM dirty ring GFN count, default 0)\n" + " notify-vmexit=run|internal-error|disable,notify-window=n (enable notify VM exit and set notify window, x86 only)\n" " thread=single|multi (enable multi-threaded TCG)\n", QEMU_ARCH_ALL) SRST ``-accel name[,prop=value[,...]]`` @@ -203,6 +204,16 @@ SRST is disabled (dirty-ring-size=0). When enabled, KVM will instead record dirty pages in a bitmap. + ``notify-vmexit=run|internal-error|disable,notify-window=n`` + Enables or disables notify VM exit support on x86 host and specify + the corresponding notify window to trigger the VM exit if enabled. + ``run`` option enables the feature. It does nothing and continue + if the exit happens. ``internal-error`` option enables the feature. + It raises a internal error. ``disable`` option doesn't enable the feature. + This feature can mitigate the CPU stuck issue due to event windows don't + open up for a specified of time (i.e. notify-window). + Default: notify-vmexit=run,notify-window=0. + ERST DEF("smp", HAS_ARG, QEMU_OPTION_smp, diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index bc7dc65bba..0e2c37ae8d 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -15,6 +15,7 @@ #include "qemu/osdep.h" #include "qapi/qapi-events-run-state.h" #include "qapi/error.h" +#include "qapi/visitor.h" #include #include #include @@ -2608,6 +2609,21 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } + if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE && + kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) { + uint64_t notify_window_flags = + ((uint64_t)s->notify_window << 32) | + KVM_X86_NOTIFY_VMEXIT_ENABLED | + KVM_X86_NOTIFY_VMEXIT_USER; + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0, + notify_window_flags); + if (ret < 0) { + error_report("kvm: Failed to enable notify vmexit cap: %s", + strerror(-ret)); + return ret; + } + } + return 0; } @@ -5099,6 +5115,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) X86CPU *cpu = X86_CPU(cs); uint64_t code; int ret; + bool ctx_invalid; + char str[256]; + KVMState *state; switch (run->exit_reason) { case KVM_EXIT_HLT: @@ -5167,6 +5186,21 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) case KVM_EXIT_HYPERCALL: ret = kvm_handle_exit_hypercall(cpu, run); break; + case KVM_EXIT_NOTIFY: + ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID); + state = KVM_STATE(current_accel()); + sprintf(str, "Encounter a notify exit with %svalid context in" + " guest. There can be possible misbehaves in guest." + " Please have a look.", ctx_invalid ? "in" : ""); + if (ctx_invalid || + state->notify_vmexit == NOTIFY_VMEXIT_OPTION_INTERNAL_ERROR) { + warn_report("KVM internal error: %s", str); + ret = -1; + } else { + warn_report_once("KVM: %s", str); + ret = 0; + } + break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); ret = -1; @@ -5444,6 +5478,70 @@ void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask) } } +static int kvm_arch_get_notify_vmexit(Object *obj, Error **errp) +{ + KVMState *s = KVM_STATE(obj); + return s->notify_vmexit; +} + +static void kvm_arch_set_notify_vmexit(Object *obj, int value, Error **errp) +{ + KVMState *s = KVM_STATE(obj); + + if (s->fd != -1) { + error_setg(errp, "Cannot set properties after the accelerator has been initialized"); + return; + } + + s->notify_vmexit = value; +} + +static void kvm_arch_get_notify_window(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + KVMState *s = KVM_STATE(obj); + uint32_t value = s->notify_window; + + visit_type_uint32(v, name, &value, errp); +} + +static void kvm_arch_set_notify_window(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + KVMState *s = KVM_STATE(obj); + Error *error = NULL; + uint32_t value; + + if (s->fd != -1) { + error_setg(errp, "Cannot set properties after the accelerator has been initialized"); + return; + } + + visit_type_uint32(v, name, &value, &error); + if (error) { + error_propagate(errp, error); + return; + } + + s->notify_window = value; +} + void kvm_arch_accel_class_init(ObjectClass *oc) { + object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption", + &NotifyVmexitOption_lookup, + kvm_arch_get_notify_vmexit, + kvm_arch_set_notify_vmexit); + object_class_property_set_description(oc, "notify-vmexit", + "Enable notify VM exit"); + + object_class_property_add(oc, "notify-window", "uint32", + kvm_arch_get_notify_window, + kvm_arch_set_notify_window, + NULL, NULL); + object_class_property_set_description(oc, "notify-window", + "Clock cycles without an event window " + "after which a notification VM exit occurs"); } -- Gitee From 79efe4b84a3657c9a08bfb545d923c82bba1e908 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 14 Mar 2022 14:25:41 +0000 Subject: [PATCH 06/23] target/i386: Fix sanity check on max APIC ID / X2APIC enablement commit dc89f32d92bba795b0665f075b78d8881cf67ab3 upstream. The check on x86ms->apic_id_limit in pc_machine_done() had two problems. Firstly, we need KVM to support the X2APIC API in order to allow IRQ delivery to APICs >= 255. So we need to call/check kvm_enable_x2apic(), which was done elsewhere in *some* cases but not all. Secondly, microvm needs the same check. So move it from pc_machine_done() to x86_cpus_init() where it will work for both. The check in kvm_cpu_instance_init() is now redundant and can be dropped. Intel-SIG: commit dc89f32d92bb ("target/i386: Fix sanity check on max APIC ID / X2APIC enablement"). Backport bugfix of max APIC ID / X2APIC for qemu-6.2.0 Signed-off-by: David Woodhouse Acked-by: Claudio Fontana Message-Id: <20220314142544.150555-1-dwmw2@infradead.org> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin [ jason: amend commit log ] Signed-off-by: Jason Zeng --- hw/i386/pc.c | 8 -------- hw/i386/x86.c | 16 ++++++++++++++++ target/i386/kvm/kvm-cpu.c | 2 +- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 357257349b..5b926bed11 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1035,14 +1035,6 @@ void pc_machine_done(Notifier *notifier, void *data) /* update FW_CFG_NB_CPUS to account for -device added CPUs */ fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus); } - - - if (x86ms->apic_id_limit > 255 && !xen_enabled() && - !kvm_irqchip_in_kernel()) { - error_report("current -smp configuration requires kernel " - "irqchip support."); - exit(EXIT_FAILURE); - } } void pc_guest_info_init(PCMachineState *pcms) diff --git a/hw/i386/x86.c b/hw/i386/x86.c index b84840a1bb..f64639b873 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -39,6 +39,7 @@ #include "sysemu/replay.h" #include "sysemu/sysemu.h" #include "sysemu/cpu-timers.h" +#include "sysemu/xen.h" #include "trace.h" #include "hw/i386/x86.h" @@ -136,6 +137,21 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) */ x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms, ms->smp.max_cpus - 1) + 1; + + /* + * Can we support APIC ID 255 or higher? + * + * Under Xen: yes. + * With userspace emulated lapic: no + * With KVM's in-kernel lapic: only if X2APIC API is enabled. + */ + if (x86ms->apic_id_limit > 255 && !xen_enabled() && + (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) { + error_report("current -smp configuration requires kernel " + "irqchip and X2APIC API support."); + exit(EXIT_FAILURE); + } + possible_cpus = mc->possible_cpu_arch_ids(ms); for (i = 0; i < ms->smp.cpus; i++) { x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c index 74c1396a93..7b8a3d5af0 100644 --- a/target/i386/kvm/kvm-cpu.c +++ b/target/i386/kvm/kvm-cpu.c @@ -172,7 +172,7 @@ static void kvm_cpu_instance_init(CPUState *cs) /* only applies to builtin_x86_defs cpus */ if (!kvm_irqchip_in_kernel()) { x86_cpu_change_kvm_default("x2apic", "off"); - } else if (kvm_irqchip_is_split() && kvm_enable_x2apic()) { + } else if (kvm_irqchip_is_split()) { x86_cpu_change_kvm_default("kvm-msi-ext-dest-id", "on"); } -- Gitee From 092069704faf30f4ebd0f695b4e2c3837a266315 Mon Sep 17 00:00:00 2001 From: Zeng Guang Date: Thu, 25 Aug 2022 10:52:46 +0800 Subject: [PATCH 07/23] target/i386: Set maximum APIC ID to KVM prior to vCPU creation commit 19e2a9fb9da067acba95b3be83588bda5a3f6a99 upstream. Specify maximum possible APIC ID assigned for current VM session to KVM prior to the creation of vCPUs. By this setting, KVM can set up VM-scoped data structure indexed by the APIC ID, e.g. Posted-Interrupt Descriptor pointer table to support Intel IPI virtualization, with the most optimal memory footprint. It can be achieved by calling KVM_ENABLE_CAP for KVM_CAP_MAX_VCPU_ID capability once KVM has enabled it. Ignoring the return error if KVM doesn't support this capability yet. Intel-SIG: commit 19e2a9fb9da0 ("target/i386: Set maximum APIC ID to KVM prior to vCPU creation"). Backport setting maximum APIC ID for qemu-6.2.0 Signed-off-by: Zeng Guang Acked-by: Peter Xu Acked-by: Michael S. Tsirkin Message-Id: <20220825025246.26618-1-guang.zeng@intel.com> Signed-off-by: Paolo Bonzini [ jason: amend commit log ] Signed-off-by: Jason Zeng --- hw/i386/x86.c | 4 ++++ target/i386/kvm/kvm-stub.c | 5 +++++ target/i386/kvm/kvm.c | 5 +++++ target/i386/kvm/kvm_i386.h | 2 ++ 4 files changed, 16 insertions(+) diff --git a/hw/i386/x86.c b/hw/i386/x86.c index f64639b873..a3258d78fa 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -152,6 +152,10 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version) exit(EXIT_FAILURE); } + if (kvm_enabled()) { + kvm_set_max_apic_id(x86ms->apic_id_limit); + } + possible_cpus = mc->possible_cpu_arch_ids(ms); for (i = 0; i < ms->smp.cpus; i++) { x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal); diff --git a/target/i386/kvm/kvm-stub.c b/target/i386/kvm/kvm-stub.c index f6e7e4466e..e052f1c7b0 100644 --- a/target/i386/kvm/kvm-stub.c +++ b/target/i386/kvm/kvm-stub.c @@ -44,3 +44,8 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp) { abort(); } + +void kvm_set_max_apic_id(uint32_t max_apic_id) +{ + return; +} diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 0e2c37ae8d..390b26201e 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -5545,3 +5545,8 @@ void kvm_arch_accel_class_init(ObjectClass *oc) "Clock cycles without an event window " "after which a notification VM exit occurs"); } + +void kvm_set_max_apic_id(uint32_t max_apic_id) +{ + kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id); +} diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h index 2ed586c11b..23f6cab600 100644 --- a/target/i386/kvm/kvm_i386.h +++ b/target/i386/kvm/kvm_i386.h @@ -65,4 +65,6 @@ typedef struct kvm_msr_handlers { bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, QEMUWRMSRHandler *wrmsr); +void kvm_set_max_apic_id(uint32_t max_apic_id); + #endif -- Gitee From e253676666b84e20eb32d8c0eceb80fcff51f1b7 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 9 Nov 2022 15:48:34 +1300 Subject: [PATCH 08/23] target/i386: Add SGX aex-notify and EDECCSSA support commit d45f24fe7525d8a8aaa4ca6d9d214dc41819caa5 upstream. The new SGX Asynchronous Exit (AEX) notification mechanism (AEX-notify) allows one enclave to receive a notification in the ERESUME after the enclave exit due to an AEX. EDECCSSA is a new SGX user leaf function (ENCLU[EDECCSSA]) to facilitate the AEX notification handling. Whether the hardware supports to create enclave with AEX-notify support is enumerated via CPUID.(EAX=0x12,ECX=0x1):EAX[10]. The new EDECCSSA user leaf function is enumerated via CPUID.(EAX=0x12,ECX=0x0):EAX[11]. Add support to allow to expose the new SGX AEX-notify feature and the new EDECCSSA user leaf function to KVM guest. Intel-SIG: commit d45f24fe7525 ("target/i386: Add SGX aex-notify and EDECCSSA support"). Backport SGX aex_notify and EDECCSSA support Link: https://lore.kernel.org/lkml/166760360549.4906.809756297092548496.tip-bot2@tip-bot2/ Link: https://lore.kernel.org/lkml/166760360934.4906.2427175408052308969.tip-bot2@tip-bot2/ Reviewed-by: Yang Zhong Signed-off-by: Kai Huang Message-Id: <20221109024834.172705-1-kai.huang@intel.com> Signed-off-by: Paolo Bonzini [ jason: amend commit log ] Signed-off-by: Jason Zeng --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 67b3593285..21300c2d19 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1246,7 +1246,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .feat_names = { "sgx1", "sgx2", NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "sgx-edeccssa", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1286,7 +1286,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .feat_names = { NULL, "sgx-debug", "sgx-mode64", NULL, "sgx-provisionkey", "sgx-tokenkey", NULL, "sgx-kss", - NULL, NULL, NULL, NULL, + NULL, NULL, "sgx-aex-notify", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -- Gitee From 8397ac179c57586c6b614fcaafc64488f98c528c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 27 Feb 2023 10:41:46 +0100 Subject: [PATCH 09/23] target/i386: KVM: allow fast string operations if host supports them commit 3023c9b4d1092eb27a523c08d9e78cbaec67b59b upstream. These are just a flag that documents the performance characteristic of an instruction; it needs no hypervisor support. So include them even if KVM does not show them. In particular, FZRM/FSRS/FSRC have only been added very recently, but they are available on Sapphire Rapids processors. Intel-SIG: commit 3023c9b4d109 target/i386: KVM: allow fast string operations if host supports them Backport to add new CPU model SapphireRapids and new fast string op leaves. Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini [ Aichun Shi: amend commit log ] Signed-off-by: Aichun Shi --- target/i386/kvm/kvm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 390b26201e..bf9fe77843 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -354,7 +354,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, { struct kvm_cpuid2 *cpuid; uint32_t ret = 0; - uint32_t cpuid_1_edx; + uint32_t cpuid_1_edx, unused; uint64_t bitmask; cpuid = get_supported_cpuid(s); @@ -401,10 +401,20 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, } else if (function == 6 && reg == R_EAX) { ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */ } else if (function == 7 && index == 0 && reg == R_EBX) { + /* Not new instructions, just an optimization. */ + uint32_t ebx; + host_cpuid(7, 0, &unused, &ebx, &unused, &unused); + ret |= ebx & CPUID_7_0_EBX_ERMS; + if (host_tsx_broken()) { ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE); } } else if (function == 7 && index == 0 && reg == R_EDX) { + /* Not new instructions, just an optimization. */ + uint32_t edx; + host_cpuid(7, 0, &unused, &unused, &unused, &edx); + ret |= edx & CPUID_7_0_EDX_FSRM; + /* * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts. * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is @@ -413,6 +423,11 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, if (!has_msr_arch_capabs) { ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES; } + } else if (function == 7 && index == 1 && reg == R_EAX) { + /* Not new instructions, just an optimization. */ + uint32_t eax; + host_cpuid(7, 1, &eax, &unused, &unused, &unused); + ret |= eax & (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_FSRC); } else if (function == 0xd && index == 0 && (reg == R_EAX || reg == R_EDX)) { /* -- Gitee From c9241ad1fcb40835ad526190ca4ffe7eea9976f7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Nov 2021 13:38:58 +0100 Subject: [PATCH 10/23] configure, meson: move AVX tests to meson commit 622753d2fb501509ab03c241d476815f378d4ba5 upstream. For consistency with other tests, --enable-avx2 and --enable-avx512f fail to compile on x86 systems if cpuid.h is not available. Intel-SIG: commit 622753d2fb50 configure, meson: move AVX tests to meson Backport AVX512 support for xbzrle_encode_buffer. Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini [ Aichun Shi: amend commit log ] Signed-off-by: Aichun Shi --- configure | 103 ---------------------------------- meson.build | 50 ++++++++++++++++- meson_options.txt | 4 ++ scripts/meson-buildoptions.sh | 6 ++ 4 files changed, 58 insertions(+), 105 deletions(-) diff --git a/configure b/configure index 6f4057f888..571092a822 100755 --- a/configure +++ b/configure @@ -330,8 +330,6 @@ qom_cast_debug="yes" trace_backends="log" trace_file="trace" opengl="$default_feature" -cpuid_h="no" -avx2_opt="$default_feature" guest_agent="$default_feature" guest_agent_with_vss="no" guest_agent_ntddscsi="no" @@ -1063,14 +1061,6 @@ for opt do ;; --disable-tools) want_tools="no" ;; - --disable-avx2) avx2_opt="no" - ;; - --enable-avx2) avx2_opt="yes" - ;; - --disable-avx512f) avx512f_opt="no" - ;; - --enable-avx512f) avx512f_opt="yes" - ;; --disable-virtio-blk-data-plane|--enable-virtio-blk-data-plane) echo "$0: $opt is obsolete, virtio-blk data-plane is always on" >&2 ;; @@ -1468,8 +1458,6 @@ cat << EOF tpm TPM support libssh ssh block device support numa libnuma support - avx2 AVX2 optimization support - avx512f AVX512F optimization support replication replication support opengl opengl support xfsctl xfsctl support @@ -2908,85 +2896,6 @@ else # "$safe_stack" = "" fi fi -######################################## -# check if cpuid.h is usable. - -cat > $TMPC << EOF -#include -int main(void) { - unsigned a, b, c, d; - int max = __get_cpuid_max(0, 0); - - if (max >= 1) { - __cpuid(1, a, b, c, d); - } - - if (max >= 7) { - __cpuid_count(7, 0, a, b, c, d); - } - - return 0; -} -EOF -if compile_prog "" "" ; then - cpuid_h=yes -fi - -########################################## -# avx2 optimization requirement check -# -# There is no point enabling this if cpuid.h is not usable, -# since we won't be able to select the new routines. - -if test "$cpuid_h" = "yes" && test "$avx2_opt" != "no"; then - cat > $TMPC << EOF -#pragma GCC push_options -#pragma GCC target("avx2") -#include -#include -static int bar(void *a) { - __m256i x = *(__m256i *)a; - return _mm256_testz_si256(x, x); -} -int main(int argc, char *argv[]) { return bar(argv[0]); } -EOF - if compile_object "-Werror" ; then - avx2_opt="yes" - else - avx2_opt="no" - fi -fi - -########################################## -# avx512f optimization requirement check -# -# There is no point enabling this if cpuid.h is not usable, -# since we won't be able to select the new routines. -# by default, it is turned off. -# if user explicitly want to enable it, check environment - -if test "$cpuid_h" = "yes" && test "$avx512f_opt" = "yes"; then - cat > $TMPC << EOF -#pragma GCC push_options -#pragma GCC target("avx512f") -#include -#include -static int bar(void *a) { - __m512i x = *(__m512i *)a; - return _mm512_test_epi64_mask(x, x); -} -int main(int argc, char *argv[]) -{ - return bar(argv[0]); -} -EOF - if ! compile_object "-Werror" ; then - avx512f_opt="no" - fi -else - avx512f_opt="no" -fi - ######################################## # check if __[u]int128_t is usable. @@ -3608,14 +3517,6 @@ if test "$opengl" = "yes" ; then echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak fi -if test "$avx2_opt" = "yes" ; then - echo "CONFIG_AVX2_OPT=y" >> $config_host_mak -fi - -if test "$avx512f_opt" = "yes" ; then - echo "CONFIG_AVX512F_OPT=y" >> $config_host_mak -fi - # XXX: suppress that if [ "$bsd" = "yes" ] ; then echo "CONFIG_BSD=y" >> $config_host_mak @@ -3648,10 +3549,6 @@ if test "$have_tsan" = "yes" && test "$have_tsan_iface_fiber" = "yes" ; then echo "CONFIG_TSAN=y" >> $config_host_mak fi -if test "$cpuid_h" = "yes" ; then - echo "CONFIG_CPUID_H=y" >> $config_host_mak -fi - if test "$int128" = "yes" ; then echo "CONFIG_INT128=y" >> $config_host_mak fi diff --git a/meson.build b/meson.build index 4d4fe04bd0..8f7eacde6a 100644 --- a/meson.build +++ b/meson.build @@ -1750,6 +1750,52 @@ config_host_data.set('CONFIG_GETAUXVAL', cc.links(gnu_source_prefix + ''' return getauxval(AT_HWCAP) == 0; }''')) +have_cpuid_h = cc.links(''' + #include + int main(void) { + unsigned a, b, c, d; + unsigned max = __get_cpuid_max(0, 0); + + if (max >= 1) { + __cpuid(1, a, b, c, d); + } + + if (max >= 7) { + __cpuid_count(7, 0, a, b, c, d); + } + + return 0; + }''') +config_host_data.set('CONFIG_CPUID_H', have_cpuid_h) + +config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX2') \ + .require(cc.links(''' + #pragma GCC push_options + #pragma GCC target("avx2") + #include + #include + static int bar(void *a) { + __m256i x = *(__m256i *)a; + return _mm256_testz_si256(x, x); + } + int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX2 not available').allowed()) + +config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512F') \ + .require(cc.links(''' + #pragma GCC push_options + #pragma GCC target("avx512f") + #include + #include + static int bar(void *a) { + __m512i x = *(__m512i *)a; + return _mm512_test_epi64_mask(x, x); + } + int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512F not available').allowed()) + config_host_data.set('CONFIG_AF_VSOCK', cc.compiles(gnu_source_prefix + ''' #include #include @@ -3274,8 +3320,8 @@ summary_info += {'membarrier': config_host.has_key('CONFIG_MEMBARRIER')} summary_info += {'debug stack usage': config_host.has_key('CONFIG_DEBUG_STACK_USAGE')} summary_info += {'mutex debugging': config_host.has_key('CONFIG_DEBUG_MUTEX')} summary_info += {'memory allocator': get_option('malloc')} -summary_info += {'avx2 optimization': config_host.has_key('CONFIG_AVX2_OPT')} -summary_info += {'avx512f optimization': config_host.has_key('CONFIG_AVX512F_OPT')} +summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')} +summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')} summary_info += {'gprof enabled': config_host.has_key('CONFIG_GPROF')} summary_info += {'gcov': get_option('b_coverage')} summary_info += {'thread sanitizer': config_host.has_key('CONFIG_TSAN')} diff --git a/meson_options.txt b/meson_options.txt index e392323732..e9cbe48cb9 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -66,6 +66,10 @@ option('cfi_debug', type: 'boolean', value: 'false', description: 'Verbose errors in case of CFI violation') option('multiprocess', type: 'feature', value: 'auto', description: 'Out of process device emulation support') +option('avx2', type: 'feature', value: 'auto', + description: 'AVX2 optimizations') +option('avx512f', type: 'feature', value: 'disabled', + description: 'AVX512F optimizations') option('attr', type : 'feature', value : 'auto', description: 'attr/xattr support') diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 7a17ff4218..b994bf16f0 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -25,6 +25,8 @@ meson_options_help() { printf "%s\n" ' alsa ALSA sound support' printf "%s\n" ' attr attr/xattr support' printf "%s\n" ' auth-pam PAM access control' + printf "%s\n" ' avx2 AVX2 optimizations' + printf "%s\n" ' avx512f AVX512F optimizations' printf "%s\n" ' bpf eBPF support' printf "%s\n" ' brlapi brlapi character device driver' printf "%s\n" ' bzip2 bzip2 support for DMG images' @@ -107,6 +109,10 @@ _meson_option_parse() { --disable-attr) printf "%s" -Dattr=disabled ;; --enable-auth-pam) printf "%s" -Dauth_pam=enabled ;; --disable-auth-pam) printf "%s" -Dauth_pam=disabled ;; + --enable-avx2) printf "%s" -Davx2=enabled ;; + --disable-avx2) printf "%s" -Davx2=disabled ;; + --enable-avx512f) printf "%s" -Davx512f=enabled ;; + --disable-avx512f) printf "%s" -Davx512f=disabled ;; --enable-bpf) printf "%s" -Dbpf=enabled ;; --disable-bpf) printf "%s" -Dbpf=disabled ;; --enable-brlapi) printf "%s" -Dbrlapi=enabled ;; -- Gitee From eb80f86dbf6ebcb402298bc060af41cef87e4f61 Mon Sep 17 00:00:00 2001 From: ling xu Date: Wed, 16 Nov 2022 23:29:22 +0800 Subject: [PATCH 11/23] AVX512 support for xbzrle_encode_buffer commit 04ffce137b6d85ab4e7687e54e4dffcef0a9ab99 upstream. This commit is the same with [PATCH v6 1/2], and provides avx512 support for xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Runtime check of avx512 support and benchmark for this feature are added. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve 50%-70% performance improvement on benchmarking. In addition, if dirty data is randomly located in 4K page, the avx512 version can achieve almost 140% performance gain. Intel-SIG: commit 04ffce137b6d AVX512 support for xbzrle_encode_buffer Backport AVX512 support for xbzrle_encode_buffer. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela [ Aichun Shi: amend commit log ] Signed-off-by: Aichun Shi --- meson.build | 17 +++++ meson_options.txt | 2 + migration/ram.c | 34 +++++++++- migration/xbzrle.c | 124 ++++++++++++++++++++++++++++++++++ migration/xbzrle.h | 4 ++ scripts/meson-buildoptions.sh | 3 + 6 files changed, 181 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 8f7eacde6a..869a61a5fb 100644 --- a/meson.build +++ b/meson.build @@ -1816,6 +1816,22 @@ config_host_data.set('CONFIG_AF_VSOCK', cc.compiles(gnu_source_prefix + ''' return -1; }''')) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' + #pragma GCC push_options + #pragma GCC target("avx512bw") + #include + #include + static int bar(void *a) { + + __m512i *x = a; + __m512i res= _mm512_abs_epi8(*x); + return res[1]; + } + int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + ignored = ['CONFIG_QEMU_INTERP_PREFIX', # actually per-target 'HAVE_GDB_BIN'] arrays = ['CONFIG_BDRV_RW_WHITELIST', 'CONFIG_BDRV_RO_WHITELIST'] @@ -3321,6 +3337,7 @@ summary_info += {'debug stack usage': config_host.has_key('CONFIG_DEBUG_STACK_US summary_info += {'mutex debugging': config_host.has_key('CONFIG_DEBUG_MUTEX')} summary_info += {'memory allocator': get_option('malloc')} summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')} +summary_info += {'avx512bw optimization': config_host_data.get('CONFIG_AVX512BW_OPT')} summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')} summary_info += {'gprof enabled': config_host.has_key('CONFIG_GPROF')} summary_info += {'gcov': get_option('b_coverage')} diff --git a/meson_options.txt b/meson_options.txt index e9cbe48cb9..ec9c3c0a05 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -70,6 +70,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') +option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('attr', type : 'feature', value : 'auto', description: 'attr/xattr support') diff --git a/migration/ram.c b/migration/ram.c index 727fe801db..efb6b15604 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -102,6 +102,34 @@ static inline bool is_zero_range(uint8_t *p, uint64_t size) return buffer_is_zero(p, size); } +int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, + uint8_t *, int) = xbzrle_encode_buffer; +#if defined(CONFIG_AVX512BW_OPT) +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ + unsigned max = __get_cpuid_max(0, NULL); + int a, b, c, d; + if (max >= 1) { + __cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ + if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { + int bv; + __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); + __cpuid_count(7, 0, a, b, c, d); + /* 0xe6: + * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 + * and ZMM16-ZMM31 state are enabled by OS) + * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) + */ + if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { + xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; + } + } + } +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -766,9 +794,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ - encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, - TARGET_PAGE_SIZE, XBZRLE.encoded_buf, - TARGET_PAGE_SIZE); + encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, + TARGET_PAGE_SIZE, XBZRLE.encoded_buf, + TARGET_PAGE_SIZE); /* * Update the cache contents, so that it corresponds to the data diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 1ba482ded9..05366e86c0 100644 --- a/migration/xbzrle.c +++ b/migration/xbzrle.c @@ -174,3 +174,127 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen) return d; } + +#if defined(CONFIG_AVX512BW_OPT) +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, + uint8_t *dst, int dlen) +{ + uint32_t zrun_len = 0, nzrun_len = 0; + int d = 0, i = 0, num = 0; + uint8_t *nzrun_start = NULL; + /* add 1 to include residual part in main loop */ + uint32_t count512s = (slen >> 6) + 1; + /* countResidual is tail of data, i.e., countResidual = slen % 64 */ + uint32_t count_residual = slen & 0b111111; + bool never_same = true; + uint64_t mask_residual = 1; + mask_residual <<= count_residual; + mask_residual -= 1; + __m512i r = _mm512_set1_epi32(0); + + while (count512s) { + if (d + 2 > dlen) { + return -1; + } + + int bytes_to_check = 64; + uint64_t mask = 0xffffffffffffffff; + if (count512s == 1) { + bytes_to_check = count_residual; + mask = mask_residual; + } + __m512i old_data = _mm512_mask_loadu_epi8(r, + mask, old_buf + i); + __m512i new_data = _mm512_mask_loadu_epi8(r, + mask, new_buf + i); + uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data); + count512s--; + + bool is_same = (comp & 0x1); + while (bytes_to_check) { + if (is_same) { + if (nzrun_len) { + d += uleb128_encode_small(dst + d, nzrun_len); + if (d + nzrun_len > dlen) { + return -1; + } + nzrun_start = new_buf + i - nzrun_len; + memcpy(dst + d, nzrun_start, nzrun_len); + d += nzrun_len; + nzrun_len = 0; + } + /* 64 data at a time for speed */ + if (count512s && (comp == 0xffffffffffffffff)) { + i += 64; + zrun_len += 64; + break; + } + never_same = false; + num = __builtin_ctzll(~comp); + num = (num < bytes_to_check) ? num : bytes_to_check; + zrun_len += num; + bytes_to_check -= num; + comp >>= num; + i += num; + if (bytes_to_check) { + /* still has different data after same data */ + d += uleb128_encode_small(dst + d, zrun_len); + zrun_len = 0; + } else { + break; + } + } + if (never_same || zrun_len) { + /* + * never_same only acts if + * data begins with diff in first count512s + */ + d += uleb128_encode_small(dst + d, zrun_len); + zrun_len = 0; + never_same = false; + } + /* has diff, 64 data at a time for speed */ + if ((bytes_to_check == 64) && (comp == 0x0)) { + i += 64; + nzrun_len += 64; + break; + } + num = __builtin_ctzll(comp); + num = (num < bytes_to_check) ? num : bytes_to_check; + nzrun_len += num; + bytes_to_check -= num; + comp >>= num; + i += num; + if (bytes_to_check) { + /* mask like 111000 */ + d += uleb128_encode_small(dst + d, nzrun_len); + /* overflow */ + if (d + nzrun_len > dlen) { + return -1; + } + nzrun_start = new_buf + i - nzrun_len; + memcpy(dst + d, nzrun_start, nzrun_len); + d += nzrun_len; + nzrun_len = 0; + is_same = true; + } + } + } + + if (nzrun_len != 0) { + d += uleb128_encode_small(dst + d, nzrun_len); + /* overflow */ + if (d + nzrun_len > dlen) { + return -1; + } + nzrun_start = new_buf + i - nzrun_len; + memcpy(dst + d, nzrun_start, nzrun_len); + d += nzrun_len; + } + return d; +} +#pragma GCC pop_options +#endif diff --git a/migration/xbzrle.h b/migration/xbzrle.h index a0db507b9c..6feb49160a 100644 --- a/migration/xbzrle.h +++ b/migration/xbzrle.h @@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, uint8_t *dst, int dlen); int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen); +#if defined(CONFIG_AVX512BW_OPT) +int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, + uint8_t *dst, int dlen); +#endif #endif diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index b994bf16f0..8c00cce411 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -26,6 +26,7 @@ meson_options_help() { printf "%s\n" ' attr attr/xattr support' printf "%s\n" ' auth-pam PAM access control' printf "%s\n" ' avx2 AVX2 optimizations' + printf "%s\n" ' avx512bw AVX512BW optimizations' printf "%s\n" ' avx512f AVX512F optimizations' printf "%s\n" ' bpf eBPF support' printf "%s\n" ' brlapi brlapi character device driver' @@ -111,6 +112,8 @@ _meson_option_parse() { --disable-auth-pam) printf "%s" -Dauth_pam=disabled ;; --enable-avx2) printf "%s" -Davx2=enabled ;; --disable-avx2) printf "%s" -Davx2=disabled ;; + --enable-avx512bw) printf "%s" -Davx512bw=enabled ;; + --disable-avx512bw) printf "%s" -Davx512bw=disabled ;; --enable-avx512f) printf "%s" -Davx512f=enabled ;; --disable-avx512f) printf "%s" -Davx512f=disabled ;; --enable-bpf) printf "%s" -Dbpf=enabled ;; -- Gitee From c6114ee5deb5f80582a7c9d2564dc5f9a0fb2876 Mon Sep 17 00:00:00 2001 From: ling xu Date: Wed, 16 Nov 2022 23:29:23 +0800 Subject: [PATCH 12/23] Update bench-code for addressing CI problem commit cc98c9fd5c17b8ab62ad91b183060d8f70b9d00d upstream. Unit test code is in test-xbzrle.c, and benchmark code is in xbzrle-bench.c for performance benchmarking. we have modified xbzrle-bench.c to address CI problem. Intel-SIG: commit cc98c9fd5c17 Update bench-code for addressing CI problem Backport AVX512 support for xbzrle_encode_buffer. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela [ Aichun Shi: amend commit log ] Signed-off-by: Aichun Shi --- tests/bench/meson.build | 6 + tests/bench/xbzrle-bench.c | 469 +++++++++++++++++++++++++++++++++++++ tests/unit/test-xbzrle.c | 39 ++- 3 files changed, 509 insertions(+), 5 deletions(-) create mode 100644 tests/bench/xbzrle-bench.c diff --git a/tests/bench/meson.build b/tests/bench/meson.build index 00b3c209dc..54bc8938a8 100644 --- a/tests/bench/meson.build +++ b/tests/bench/meson.build @@ -3,6 +3,12 @@ qht_bench = executable('qht-bench', sources: 'qht-bench.c', dependencies: [qemuutil]) +if have_system +xbzrle_bench = executable('xbzrle-bench', + sources: 'xbzrle-bench.c', + dependencies: [qemuutil,migration]) +endif + executable('atomic_add-bench', sources: files('atomic_add-bench.c'), dependencies: [qemuutil], diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c new file mode 100644 index 0000000000..8848a3a32d --- /dev/null +++ b/tests/bench/xbzrle-bench.c @@ -0,0 +1,469 @@ +/* + * Xor Based Zero Run Length Encoding unit tests. + * + * Copyright 2013 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Orit Wasserman + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "../migration/xbzrle.h" + +#if defined(CONFIG_AVX512BW_OPT) +#define XBZRLE_PAGE_SIZE 4096 +static bool is_cpu_support_avx512bw; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ + unsigned max = __get_cpuid_max(0, NULL); + int a, b, c, d; + is_cpu_support_avx512bw = false; + if (max >= 1) { + __cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ + if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { + int bv; + __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); + __cpuid_count(7, 0, a, b, c, d); + /* 0xe6: + * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 + * and ZMM16-ZMM31 state are enabled by OS) + * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) + */ + if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { + is_cpu_support_avx512bw = true; + } + } + } + return ; +} + +struct ResTime { + float t_raw; + float t_512; +}; + + +/* Function prototypes +int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, + uint8_t *dst, int dlen); +*/ +static void encode_decode_zero(struct ResTime *res) +{ + uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); + int i = 0; + int dlen = 0, dlen512 = 0; + int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); + + for (i = diff_len; i > 0; i--) { + buffer[1000 + i] = i; + buffer512[1000 + i] = i; + } + + buffer[1000 + diff_len + 3] = 103; + buffer[1000 + diff_len + 5] = 105; + + buffer512[1000 + diff_len + 3] = 103; + buffer512[1000 + diff_len + 5] = 105; + + /* encode zero page */ + time_t t_start, t_end, t_start512, t_end512; + t_start = clock(); + dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + t_end = clock(); + float time_val = difftime(t_end, t_start); + g_assert(dlen == 0); + + t_start512 = clock(); + dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); + t_end512 = clock(); + float time_val512 = difftime(t_end512, t_start512); + g_assert(dlen512 == 0); + + res->t_raw = time_val; + res->t_512 = time_val512; + + g_free(buffer); + g_free(compressed); + g_free(buffer512); + g_free(compressed512); + +} + +static void test_encode_decode_zero_avx512(void) +{ + int i; + float time_raw = 0.0, time_512 = 0.0; + struct ResTime res; + for (i = 0; i < 10000; i++) { + encode_decode_zero(&res); + time_raw += res.t_raw; + time_512 += res.t_512; + } + printf("Zero test:\n"); + printf("Raw xbzrle_encode time is %f ms\n", time_raw); + printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static void encode_decode_unchanged(struct ResTime *res) +{ + uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); + int i = 0; + int dlen = 0, dlen512 = 0; + int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); + + for (i = diff_len; i > 0; i--) { + test[1000 + i] = i + 4; + test512[1000 + i] = i + 4; + } + + test[1000 + diff_len + 3] = 107; + test[1000 + diff_len + 5] = 109; + + test512[1000 + diff_len + 3] = 107; + test512[1000 + diff_len + 5] = 109; + + /* test unchanged buffer */ + time_t t_start, t_end, t_start512, t_end512; + t_start = clock(); + dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + t_end = clock(); + float time_val = difftime(t_end, t_start); + g_assert(dlen == 0); + + t_start512 = clock(); + dlen512 = xbzrle_encode_buffer_avx512(test512, test512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); + t_end512 = clock(); + float time_val512 = difftime(t_end512, t_start512); + g_assert(dlen512 == 0); + + res->t_raw = time_val; + res->t_512 = time_val512; + + g_free(test); + g_free(compressed); + g_free(test512); + g_free(compressed512); + +} + +static void test_encode_decode_unchanged_avx512(void) +{ + int i; + float time_raw = 0.0, time_512 = 0.0; + struct ResTime res; + for (i = 0; i < 10000; i++) { + encode_decode_unchanged(&res); + time_raw += res.t_raw; + time_512 += res.t_512; + } + printf("Unchanged test:\n"); + printf("Raw xbzrle_encode time is %f ms\n", time_raw); + printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static void encode_decode_1_byte(struct ResTime *res) +{ + uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); + uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE); + int dlen = 0, rc = 0, dlen512 = 0, rc512 = 0; + uint8_t buf[2]; + uint8_t buf512[2]; + + test[XBZRLE_PAGE_SIZE - 1] = 1; + test512[XBZRLE_PAGE_SIZE - 1] = 1; + + time_t t_start, t_end, t_start512, t_end512; + t_start = clock(); + dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + t_end = clock(); + float time_val = difftime(t_end, t_start); + g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2)); + + rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE); + g_assert(rc == XBZRLE_PAGE_SIZE); + g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0); + + t_start512 = clock(); + dlen512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); + t_end512 = clock(); + float time_val512 = difftime(t_end512, t_start512); + g_assert(dlen512 == (uleb128_encode_small(&buf512[0], 4095) + 2)); + + rc512 = xbzrle_decode_buffer(compressed512, dlen512, buffer512, + XBZRLE_PAGE_SIZE); + g_assert(rc512 == XBZRLE_PAGE_SIZE); + g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0); + + res->t_raw = time_val; + res->t_512 = time_val512; + + g_free(buffer); + g_free(compressed); + g_free(test); + g_free(buffer512); + g_free(compressed512); + g_free(test512); + +} + +static void test_encode_decode_1_byte_avx512(void) +{ + int i; + float time_raw = 0.0, time_512 = 0.0; + struct ResTime res; + for (i = 0; i < 10000; i++) { + encode_decode_1_byte(&res); + time_raw += res.t_raw; + time_512 += res.t_512; + } + printf("1 byte test:\n"); + printf("Raw xbzrle_encode time is %f ms\n", time_raw); + printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static void encode_decode_overflow(struct ResTime *res) +{ + uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); + int i = 0, rc = 0, rc512 = 0; + + for (i = 0; i < XBZRLE_PAGE_SIZE / 2 - 1; i++) { + test[i * 2] = 1; + test512[i * 2] = 1; + } + + /* encode overflow */ + time_t t_start, t_end, t_start512, t_end512; + t_start = clock(); + rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + t_end = clock(); + float time_val = difftime(t_end, t_start); + g_assert(rc == -1); + + t_start512 = clock(); + rc512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); + t_end512 = clock(); + float time_val512 = difftime(t_end512, t_start512); + g_assert(rc512 == -1); + + res->t_raw = time_val; + res->t_512 = time_val512; + + g_free(buffer); + g_free(compressed); + g_free(test); + g_free(buffer512); + g_free(compressed512); + g_free(test512); + +} + +static void test_encode_decode_overflow_avx512(void) +{ + int i; + float time_raw = 0.0, time_512 = 0.0; + struct ResTime res; + for (i = 0; i < 10000; i++) { + encode_decode_overflow(&res); + time_raw += res.t_raw; + time_512 += res.t_512; + } + printf("Overflow test:\n"); + printf("Raw xbzrle_encode time is %f ms\n", time_raw); + printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static void encode_decode_range_avx512(struct ResTime *res) +{ + uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); + uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE); + uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); + int i = 0, rc = 0, rc512 = 0; + int dlen = 0, dlen512 = 0; + + int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); + + for (i = diff_len; i > 0; i--) { + buffer[1000 + i] = i; + test[1000 + i] = i + 4; + buffer512[1000 + i] = i; + test512[1000 + i] = i + 4; + } + + buffer[1000 + diff_len + 3] = 103; + test[1000 + diff_len + 3] = 107; + + buffer[1000 + diff_len + 5] = 105; + test[1000 + diff_len + 5] = 109; + + buffer512[1000 + diff_len + 3] = 103; + test512[1000 + diff_len + 3] = 107; + + buffer512[1000 + diff_len + 5] = 105; + test512[1000 + diff_len + 5] = 109; + + /* test encode/decode */ + time_t t_start, t_end, t_start512, t_end512; + t_start = clock(); + dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + t_end = clock(); + float time_val = difftime(t_end, t_start); + rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE); + g_assert(rc < XBZRLE_PAGE_SIZE); + g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0); + + t_start512 = clock(); + dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); + t_end512 = clock(); + float time_val512 = difftime(t_end512, t_start512); + rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE); + g_assert(rc512 < XBZRLE_PAGE_SIZE); + g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0); + + res->t_raw = time_val; + res->t_512 = time_val512; + + g_free(buffer); + g_free(compressed); + g_free(test); + g_free(buffer512); + g_free(compressed512); + g_free(test512); + +} + +static void test_encode_decode_avx512(void) +{ + int i; + float time_raw = 0.0, time_512 = 0.0; + struct ResTime res; + for (i = 0; i < 10000; i++) { + encode_decode_range_avx512(&res); + time_raw += res.t_raw; + time_512 += res.t_512; + } + printf("Encode decode test:\n"); + printf("Raw xbzrle_encode time is %f ms\n", time_raw); + printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static void encode_decode_random(struct ResTime *res) +{ + uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); + uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); + uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE); + uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); + int i = 0, rc = 0, rc512 = 0; + int dlen = 0, dlen512 = 0; + + int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1); + /* store the index of diff */ + int dirty_index[diff_len]; + for (int j = 0; j < diff_len; j++) { + dirty_index[j] = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1); + } + for (i = diff_len - 1; i >= 0; i--) { + buffer[dirty_index[i]] = i; + test[dirty_index[i]] = i + 4; + buffer512[dirty_index[i]] = i; + test512[dirty_index[i]] = i + 4; + } + + time_t t_start, t_end, t_start512, t_end512; + t_start = clock(); + dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + t_end = clock(); + float time_val = difftime(t_end, t_start); + rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE); + g_assert(rc < XBZRLE_PAGE_SIZE); + + t_start512 = clock(); + dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); + t_end512 = clock(); + float time_val512 = difftime(t_end512, t_start512); + rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE); + g_assert(rc512 < XBZRLE_PAGE_SIZE); + + res->t_raw = time_val; + res->t_512 = time_val512; + + g_free(buffer); + g_free(compressed); + g_free(test); + g_free(buffer512); + g_free(compressed512); + g_free(test512); + +} + +static void test_encode_decode_random_avx512(void) +{ + int i; + float time_raw = 0.0, time_512 = 0.0; + struct ResTime res; + for (i = 0; i < 10000; i++) { + encode_decode_random(&res); + time_raw += res.t_raw; + time_512 += res.t_512; + } + printf("Random test:\n"); + printf("Raw xbzrle_encode time is %f ms\n", time_raw); + printf("512 xbzrle_encode time is %f ms\n", time_512); +} +#endif + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + g_test_rand_int(); + #if defined(CONFIG_AVX512BW_OPT) + if (likely(is_cpu_support_avx512bw)) { + g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero_avx512); + g_test_add_func("/xbzrle/encode_decode_unchanged", + test_encode_decode_unchanged_avx512); + g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte_avx512); + g_test_add_func("/xbzrle/encode_decode_overflow", + test_encode_decode_overflow_avx512); + g_test_add_func("/xbzrle/encode_decode", test_encode_decode_avx512); + g_test_add_func("/xbzrle/encode_decode_random", test_encode_decode_random_avx512); + } + #endif + return g_test_run(); +} diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c index 795d6f1cba..baa364b443 100644 --- a/tests/unit/test-xbzrle.c +++ b/tests/unit/test-xbzrle.c @@ -17,6 +17,35 @@ #define XBZRLE_PAGE_SIZE 4096 +int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, + uint8_t *, int) = xbzrle_encode_buffer; +#if defined(CONFIG_AVX512BW_OPT) +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ + unsigned max = __get_cpuid_max(0, NULL); + int a, b, c, d; + if (max >= 1) { + __cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ + if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { + int bv; + __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); + __cpuid_count(7, 0, a, b, c, d); + /* 0xe6: + * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 + * and ZMM16-ZMM31 state are enabled by OS) + * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) + */ + if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { + xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; + } + } + } + return ; +} +#endif + static void test_uleb(void) { uint32_t i, val; @@ -55,7 +84,7 @@ static void test_encode_decode_zero(void) buffer[1000 + diff_len + 5] = 105; /* encode zero page */ - dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, + dlen = xbzrle_encode_buffer_func(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); g_assert(dlen == 0); @@ -79,7 +108,7 @@ static void test_encode_decode_unchanged(void) test[1000 + diff_len + 5] = 109; /* test unchanged buffer */ - dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed, + dlen = xbzrle_encode_buffer_func(test, test, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); g_assert(dlen == 0); @@ -97,7 +126,7 @@ static void test_encode_decode_1_byte(void) test[XBZRLE_PAGE_SIZE - 1] = 1; - dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed, + dlen = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2)); @@ -122,7 +151,7 @@ static void test_encode_decode_overflow(void) } /* encode overflow */ - rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed, + rc = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); g_assert(rc == -1); @@ -153,7 +182,7 @@ static void encode_decode_range(void) test[1000 + diff_len + 5] = 109; /* test encode/decode */ - dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed, + dlen = xbzrle_encode_buffer_func(test, buffer, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE); -- Gitee From ad986c204d60a4157418e5016e3176dc4154f130 Mon Sep 17 00:00:00 2001 From: Matheus Tavares Bernardino Date: Mon, 13 Mar 2023 15:58:19 -0300 Subject: [PATCH 13/23] migration/xbzrle: use ctz64 to avoid undefined result commit d84a78d15d3af9ff28ceec6906a4b101bd545b55 upstream. __builtin_ctzll() produces undefined results when the argument is 0. This can be seen through test-xbzrle, which produces the following warning: ../migration/xbzrle.c:265: runtime error: passing zero to ctz(), which is not a valid argument Replace __builtin_ctzll() with our ctz64() wrapper which properly handles 0. Intel-SIG: commit d84a78d15d3a migration/xbzrle: use ctz64 to avoid undefined result Backport AVX512 support for xbzrle_encode_buffer. Signed-off-by: Matheus Tavares Bernardino Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela [ Aichun Shi: amend commit log ] Signed-off-by: Aichun Shi --- migration/xbzrle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 05366e86c0..21b92d4eae 100644 --- a/migration/xbzrle.c +++ b/migration/xbzrle.c @@ -12,6 +12,7 @@ */ #include "qemu/osdep.h" #include "qemu/cutils.h" +#include "qemu/host-utils.h" #include "xbzrle.h" /* @@ -233,7 +234,7 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, break; } never_same = false; - num = __builtin_ctzll(~comp); + num = ctz64(~comp); num = (num < bytes_to_check) ? num : bytes_to_check; zrun_len += num; bytes_to_check -= num; @@ -262,7 +263,7 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, nzrun_len += 64; break; } - num = __builtin_ctzll(comp); + num = ctz64(comp); num = (num < bytes_to_check) ? num : bytes_to_check; nzrun_len += num; bytes_to_check -= num; -- Gitee From e0979863f168a6537eef98b781d1ff8320d4c891 Mon Sep 17 00:00:00 2001 From: Matheus Tavares Bernardino Date: Mon, 13 Mar 2023 15:58:20 -0300 Subject: [PATCH 14/23] migration/xbzrle: fix out-of-bounds write with axv512 commit 1776b70f55c75541e9cab3423650a59b085162a9 upstream. xbzrle_encode_buffer_avx512() checks for overflows too scarcely in its outer loop, causing out-of-bounds writes: $ ../configure --target-list=aarch64-softmmu --enable-sanitizers --enable-avx512bw $ make tests/unit/test-xbzrle && ./tests/unit/test-xbzrle ==5518==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x62100000b100 at pc 0x561109a7714d bp 0x7ffed712a440 sp 0x7ffed712a430 WRITE of size 1 at 0x62100000b100 thread T0 #0 0x561109a7714c in uleb128_encode_small ../util/cutils.c:831 #1 0x561109b67f6a in xbzrle_encode_buffer_avx512 ../migration/xbzrle.c:275 #2 0x5611099a7428 in test_encode_decode_overflow ../tests/unit/test-xbzrle.c:153 #3 0x7fb2fb65a58d (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x7a58d) #4 0x7fb2fb65a333 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x7a333) #5 0x7fb2fb65aa79 in g_test_run_suite (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x7aa79) #6 0x7fb2fb65aa94 in g_test_run (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x7aa94) #7 0x5611099a3a23 in main ../tests/unit/test-xbzrle.c:218 #8 0x7fb2fa78c082 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x24082) #9 0x5611099a608d in _start (/qemu/build/tests/unit/test-xbzrle+0x28408d) 0x62100000b100 is located 0 bytes to the right of 4096-byte region [0x62100000a100,0x62100000b100) allocated by thread T0 here: #0 0x7fb2fb823a06 in __interceptor_calloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cc:153 #1 0x7fb2fb637ef0 in g_malloc0 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x57ef0) Fix that by performing the overflow check in the inner loop, instead. Intel-SIG: commit 1776b70f55c7 migration/xbzrle: fix out-of-bounds write with axv512 Backport AVX512 support for xbzrle_encode_buffer. Signed-off-by: Matheus Tavares Bernardino Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela [ Aichun Shi: amend commit log ] Signed-off-by: Aichun Shi --- migration/xbzrle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 21b92d4eae..c6f8b20917 100644 --- a/migration/xbzrle.c +++ b/migration/xbzrle.c @@ -197,10 +197,6 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, __m512i r = _mm512_set1_epi32(0); while (count512s) { - if (d + 2 > dlen) { - return -1; - } - int bytes_to_check = 64; uint64_t mask = 0xffffffffffffffff; if (count512s == 1) { @@ -216,6 +212,9 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, bool is_same = (comp & 0x1); while (bytes_to_check) { + if (d + 2 > dlen) { + return -1; + } if (is_same) { if (nzrun_len) { d += uleb128_encode_small(dst + d, nzrun_len); -- Gitee From 183be62f106a91351b5f6a5c79a6a6e275d50d6f Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 14 Aug 2023 21:54:27 -0700 Subject: [PATCH 15/23] target/i386: Export GDS_NO bit to guests commit 3a2a1f97ea349745094e789e6b0768dbd92d0dcd upstream. Gather Data Sampling (GDS) is a side-channel attack using Gather instructions. Some Intel processors will set ARCH_CAP_GDS_NO bit in MSR IA32_ARCH_CAPABILITIES to report that they are not vulnerable to GDS. Make this bit available to guests. Intel-SIG: commit 3a2a1f97ea34 target/i386: Export GDS_NO bit to guests Backport to export GDS_NO bit to guests(CVE-2022-40982). Closes: https://lore.kernel.org/qemu-devel/CAMGffEmG6TNq0n3+4OJAgXc8J0OevY60KHZekXCBs3LoK9vehA@mail.gmail.com/ Reported-by: Jack Wang Signed-off-by: Pawan Gupta Tested-by: Jack Wang Tested-by: Daniel Sneddon Message-ID: Signed-off-by: Paolo Bonzini [ Aichun Shi: amend commit log ] Signed-off-by: Aichun Shi --- target/i386/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 21300c2d19..74a183ed9a 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1025,7 +1025,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { NULL, "sbdr-ssdp-no", "fbsdp-no", "psdp-no", NULL, "fb-clear", NULL, NULL, NULL, NULL, NULL, NULL, - "pbrsb-no", NULL, NULL, "rfds-no", + "pbrsb-no", NULL, "gds-no", "rfds-no", "rfds-clear", NULL, NULL, NULL, }, .msr = { -- Gitee From c8a8d9c180881709528bb0e7bdfc132ea983631d Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Tue, 15 Feb 2022 14:52:53 -0500 Subject: [PATCH 16/23] target/i386: Add kvm_get_one_msr helper commit 5a778a5f820fdd907b95e93560637a61f6ea3c71 upstream. When try to get one msr from KVM, I found there's no such kind of existing interface while kvm_put_one_msr() is there. So here comes the patch. It'll remove redundant preparation code before finally call KVM_GET_MSRS IOCTL. No functional change intended. Intel-SIG: commit 5a778a5f820f target/i386: Add kvm_get_one_msr helper Backport i386/cpu bugfixes Signed-off-by: Yang Weijiang Message-Id: <20220215195258.29149-4-weijiang.yang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm.c | 46 ++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index bf9fe77843..6a8e6641e8 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -142,6 +142,7 @@ static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES]; #define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */ static RateLimit bus_lock_ratelimit_ctrl; +static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value); int kvm_has_pit_state2(void) { @@ -212,28 +213,21 @@ static int kvm_get_tsc(CPUState *cs) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; - struct { - struct kvm_msrs info; - struct kvm_msr_entry entries[1]; - } msr_data = {}; + uint64_t value; int ret; if (env->tsc_valid) { return 0; } - memset(&msr_data, 0, sizeof(msr_data)); - msr_data.info.nmsrs = 1; - msr_data.entries[0].index = MSR_IA32_TSC; env->tsc_valid = !runstate_is_running(); - ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data); + ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value); if (ret < 0) { return ret; } - assert(ret == 1); - env->tsc = msr_data.entries[0].data; + env->tsc = value; return 0; } @@ -1529,21 +1523,14 @@ static int hyperv_init_vcpu(X86CPU *cpu) * the kernel doesn't support setting vp_index; assert that its value * is in sync */ - struct { - struct kvm_msrs info; - struct kvm_msr_entry entries[1]; - } msr_data = { - .info.nmsrs = 1, - .entries[0].index = HV_X64_MSR_VP_INDEX, - }; + uint64_t value; - ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data); + ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value); if (ret < 0) { return ret; } - assert(ret == 1); - if (msr_data.entries[0].data != hyperv_vp_index(CPU(cpu))) { + if (value != hyperv_vp_index(CPU(cpu))) { error_report("kernel's vp_index != QEMU's vp_index"); return -ENXIO; } @@ -2874,6 +2861,25 @@ static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value) return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); } +static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value) +{ + int ret; + struct { + struct kvm_msrs info; + struct kvm_msr_entry entries[1]; + } msr_data = { + .info.nmsrs = 1, + .entries[0].index = index, + }; + + ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data); + if (ret < 0) { + return ret; + } + assert(ret == 1); + *value = msr_data.entries[0].data; + return ret; +} void kvm_put_apicbase(X86CPU *cpu, uint64_t value) { int ret; -- Gitee From 5e567b089d97bf1f73f4e4c1680ce9e42d683a7a Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Tue, 15 Feb 2022 14:52:54 -0500 Subject: [PATCH 17/23] target/i386: Enable support for XSAVES based features commit 301e90675c3fed6cdc48682021a1ab42bc0e0d76 upstream. There're some new features, including Arch LBR, depending on XSAVES/XRSTORS support, the new instructions will save/restore data based on feature bits enabled in XCR0 | XSS. This patch adds the basic support for related CPUID enumeration and meanwhile changes the name from FEAT_XSAVE_COMP_{LO|HI} to FEAT_XSAVE_XCR0_{LO|HI} to differentiate clearly the feature bits in XCR0 and those in XSS. Intel-SIG: commit 301e90675c3f target/i386: Enable support for XSAVES based features Backport i386/cpu bugfixes Signed-off-by: Yang Weijiang Message-Id: <20220215195258.29149-5-weijiang.yang@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 104 +++++++++++++++++++++++++++++++++++----------- target/i386/cpu.h | 14 ++++++- 2 files changed, 92 insertions(+), 26 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 74a183ed9a..7f7143aa29 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -978,6 +978,34 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { }, .tcg_features = TCG_XSAVE_FEATURES, }, + [FEAT_XSAVE_XSS_LO] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + }, + .cpuid = { + .eax = 0xD, + .needs_ecx = true, + .ecx = 1, + .reg = R_ECX, + }, + }, + [FEAT_XSAVE_XSS_HI] = { + .type = CPUID_FEATURE_WORD, + .cpuid = { + .eax = 0xD, + .needs_ecx = true, + .ecx = 1, + .reg = R_EDX + }, + }, [FEAT_6_EAX] = { .type = CPUID_FEATURE_WORD, .feat_names = { @@ -993,7 +1021,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .cpuid = { .eax = 6, .reg = R_EAX, }, .tcg_features = TCG_6_EAX_FEATURES, }, - [FEAT_XSAVE_COMP_LO] = { + [FEAT_XSAVE_XCR0_LO] = { .type = CPUID_FEATURE_WORD, .cpuid = { .eax = 0xD, @@ -1006,7 +1034,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { XSTATE_OPMASK_MASK | XSTATE_ZMM_Hi256_MASK | XSTATE_Hi16_ZMM_MASK | XSTATE_PKRU_MASK, }, - [FEAT_XSAVE_COMP_HI] = { + [FEAT_XSAVE_XCR0_HI] = { .type = CPUID_FEATURE_WORD, .cpuid = { .eax = 0xD, @@ -1431,6 +1459,9 @@ static const X86RegisterInfo32 x86_reg_info_32[CPU_NB_REGS32] = { }; #undef REGISTER +/* CPUID feature bits available in XSS */ +#define CPUID_XSTATE_XSS_MASK (0) + ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT] = { [XSTATE_FP_BIT] = { /* x87 FP state component is always enabled if XSAVE is supported */ @@ -1473,15 +1504,18 @@ ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT] = { }, }; -static uint32_t xsave_area_size(uint64_t mask) +static uint32_t xsave_area_size(uint64_t mask, bool compacted) { + uint64_t ret = x86_ext_save_areas[0].size; + const ExtSaveArea *esa; + uint32_t offset = 0; int i; - uint64_t ret = 0; - for (i = 0; i < ARRAY_SIZE(x86_ext_save_areas); i++) { - const ExtSaveArea *esa = &x86_ext_save_areas[i]; + for (i = 2; i < ARRAY_SIZE(x86_ext_save_areas); i++) { + esa = &x86_ext_save_areas[i]; if ((mask >> i) & 1) { - ret = MAX(ret, esa->offset + esa->size); + offset = compacted ? ret : esa->offset; + ret = MAX(ret, offset + esa->size); } } return ret; @@ -1492,10 +1526,10 @@ static inline bool accel_uses_host_cpuid(void) return kvm_enabled() || hvf_enabled(); } -static inline uint64_t x86_cpu_xsave_components(X86CPU *cpu) +static inline uint64_t x86_cpu_xsave_xcr0_components(X86CPU *cpu) { - return ((uint64_t)cpu->env.features[FEAT_XSAVE_COMP_HI]) << 32 | - cpu->env.features[FEAT_XSAVE_COMP_LO]; + return ((uint64_t)cpu->env.features[FEAT_XSAVE_XCR0_HI]) << 32 | + cpu->env.features[FEAT_XSAVE_XCR0_LO]; } /* Return name of 32-bit register, from a R_* constant */ @@ -1507,6 +1541,12 @@ static const char *get_register_name_32(unsigned int reg) return x86_reg_info_32[reg].name; } +static inline uint64_t x86_cpu_xsave_xss_components(X86CPU *cpu) +{ + return ((uint64_t)cpu->env.features[FEAT_XSAVE_XSS_HI]) << 32 | + cpu->env.features[FEAT_XSAVE_XSS_LO]; +} + /* * Returns the set of feature flags that are supported and migratable by * QEMU, for a given FeatureWord. @@ -5206,8 +5246,8 @@ static const char *x86_cpu_feature_name(FeatureWord w, int bitnr) /* XSAVE components are automatically enabled by other features, * so return the original feature name instead */ - if (w == FEAT_XSAVE_COMP_LO || w == FEAT_XSAVE_COMP_HI) { - int comp = (w == FEAT_XSAVE_COMP_HI) ? bitnr + 32 : bitnr; + if (w == FEAT_XSAVE_XCR0_LO || w == FEAT_XSAVE_XCR0_HI) { + int comp = (w == FEAT_XSAVE_XCR0_HI) ? bitnr + 32 : bitnr; if (comp < ARRAY_SIZE(x86_ext_save_areas) && x86_ext_save_areas[comp].bits) { @@ -6077,25 +6117,36 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } if (count == 0) { - *ecx = xsave_area_size(x86_cpu_xsave_components(cpu)); - *eax = env->features[FEAT_XSAVE_COMP_LO]; - *edx = env->features[FEAT_XSAVE_COMP_HI]; + *ecx = xsave_area_size(x86_cpu_xsave_xcr0_components(cpu), false); + *eax = env->features[FEAT_XSAVE_XCR0_LO]; + *edx = env->features[FEAT_XSAVE_XCR0_HI]; /* * The initial value of xcr0 and ebx == 0, On host without kvm * commit 412a3c41(e.g., CentOS 6), the ebx's value always == 0 * even through guest update xcr0, this will crash some legacy guest * (e.g., CentOS 6), So set ebx == ecx to workaroud it. */ - *ebx = kvm_enabled() ? *ecx : xsave_area_size(env->xcr0); + *ebx = kvm_enabled() ? *ecx : xsave_area_size(env->xcr0, false); } else if (count == 1) { + uint64_t xstate = x86_cpu_xsave_xcr0_components(cpu) | + x86_cpu_xsave_xss_components(cpu); + *eax = env->features[FEAT_XSAVE]; + *ebx = xsave_area_size(xstate, true); + *ecx = env->features[FEAT_XSAVE_XSS_LO]; + *edx = env->features[FEAT_XSAVE_XSS_HI]; } else if (count < ARRAY_SIZE(x86_ext_save_areas)) { - if ((x86_cpu_xsave_components(cpu) >> count) & 1) { - const ExtSaveArea *esa = &x86_ext_save_areas[count]; + const ExtSaveArea *esa = &x86_ext_save_areas[count]; + + if (x86_cpu_xsave_xcr0_components(cpu) & (1ULL << count)) { *eax = esa->size; *ebx = esa->offset; *ecx = esa->ecx & (ESA_FEATURE_ALIGN64_MASK | ESA_FEATURE_XFD_MASK); + } else if (x86_cpu_xsave_xss_components(cpu) & (1ULL << count)) { + *eax = esa->size; + *ebx = 0; + *ecx = 1; } } break; @@ -6146,8 +6197,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } else { *eax &= env->features[FEAT_SGX_12_1_EAX]; *ebx &= 0; /* ebx reserve */ - *ecx &= env->features[FEAT_XSAVE_COMP_LO]; - *edx &= env->features[FEAT_XSAVE_COMP_HI]; + *ecx &= env->features[FEAT_XSAVE_XSS_LO]; + *edx &= env->features[FEAT_XSAVE_XSS_HI]; /* FP and SSE are always allowed regardless of XSAVE/XCR0. */ *ecx |= XSTATE_FP_MASK | XSTATE_SSE_MASK; @@ -6544,6 +6595,9 @@ static void x86_cpu_reset(DeviceState *dev) } for (i = 2; i < ARRAY_SIZE(x86_ext_save_areas); i++) { const ExtSaveArea *esa = &x86_ext_save_areas[i]; + if (!((1 << i) & CPUID_XSTATE_XCR0_MASK)) { + continue; + } if (env->features[esa->feature] & esa->bits) { xcr0 |= 1ull << i; } @@ -6661,8 +6715,8 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) static bool request_perm; if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { - env->features[FEAT_XSAVE_COMP_LO] = 0; - env->features[FEAT_XSAVE_COMP_HI] = 0; + env->features[FEAT_XSAVE_XCR0_LO] = 0; + env->features[FEAT_XSAVE_XCR0_HI] = 0; return; } @@ -6680,8 +6734,10 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) request_perm = true; } - env->features[FEAT_XSAVE_COMP_LO] = mask; - env->features[FEAT_XSAVE_COMP_HI] = mask >> 32; + env->features[FEAT_XSAVE_XCR0_LO] = mask & CPUID_XSTATE_XCR0_MASK; + env->features[FEAT_XSAVE_XCR0_HI] = mask >> 32; + env->features[FEAT_XSAVE_XSS_LO] = mask & CPUID_XSTATE_XSS_MASK; + env->features[FEAT_XSAVE_XSS_HI] = mask >> 32; } /***** Steps involved on loading and filtering CPUID data diff --git a/target/i386/cpu.h b/target/i386/cpu.h index c415bb3db8..b0d79a1519 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -566,6 +566,14 @@ typedef enum X86Seg { #define ESA_FEATURE_XFD_MASK (1U << ESA_FEATURE_XFD_BIT) +/* CPUID feature bits available in XCR0 */ +#define CPUID_XSTATE_XCR0_MASK (XSTATE_FP_MASK | XSTATE_SSE_MASK | \ + XSTATE_YMM_MASK | XSTATE_BNDREGS_MASK | \ + XSTATE_BNDCSR_MASK | XSTATE_OPMASK_MASK | \ + XSTATE_ZMM_Hi256_MASK | \ + XSTATE_Hi16_ZMM_MASK | XSTATE_PKRU_MASK | \ + XSTATE_XTILE_CFG_MASK | XSTATE_XTILE_DATA_MASK) + /* CPUID feature words */ typedef enum FeatureWord { FEAT_1_EDX, /* CPUID[1].EDX */ @@ -584,8 +592,8 @@ typedef enum FeatureWord { FEAT_SVM, /* CPUID[8000_000A].EDX */ FEAT_XSAVE, /* CPUID[EAX=0xd,ECX=1].EAX */ FEAT_6_EAX, /* CPUID[6].EAX */ - FEAT_XSAVE_COMP_LO, /* CPUID[EAX=0xd,ECX=0].EAX */ - FEAT_XSAVE_COMP_HI, /* CPUID[EAX=0xd,ECX=0].EDX */ + FEAT_XSAVE_XCR0_LO, /* CPUID[EAX=0xd,ECX=0].EAX */ + FEAT_XSAVE_XCR0_HI, /* CPUID[EAX=0xd,ECX=0].EDX */ FEAT_ARCH_CAPABILITIES, FEAT_CORE_CAPABILITY, FEAT_PERF_CAPABILITIES, @@ -602,6 +610,8 @@ typedef enum FeatureWord { FEAT_SGX_12_0_EAX, /* CPUID[EAX=0x12,ECX=0].EAX (SGX) */ FEAT_SGX_12_0_EBX, /* CPUID[EAX=0x12,ECX=0].EBX (SGX MISCSELECT[31:0]) */ FEAT_SGX_12_1_EAX, /* CPUID[EAX=0x12,ECX=1].EAX (SGX ATTRIBUTES[31:0]) */ + FEAT_XSAVE_XSS_LO, /* CPUID[EAX=0xd,ECX=1].ECX */ + FEAT_XSAVE_XSS_HI, /* CPUID[EAX=0xd,ECX=1].EDX */ FEAT_7_1_EDX, /* CPUID[EAX=7,ECX=1].EDX */ FEAT_7_2_EDX, /* CPUID[EAX=7,ECX=2].EDX */ FEATURE_WORDS, -- Gitee From 968758b43a02a6f1779087b0f770776ff20984c4 Mon Sep 17 00:00:00 2001 From: Yang Zhong Date: Thu, 6 Apr 2023 02:40:41 -0400 Subject: [PATCH 18/23] target/i386: Change wrong XFRM value in SGX CPUID leaf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 72497cff896fecf74306ed33626c30e43633cdd6 upstream. The previous patch wrongly replaced FEAT_XSAVE_XCR0_{LO|HI} with FEAT_XSAVE_XSS_{LO|HI} in CPUID(EAX=12,ECX=1):{ECX,EDX}. As a result, SGX enclaves only supported SSE and x87 feature (xfrm=0x3). Intel-SIG: commit 72497cff896f target/i386: Change wrong XFRM value in SGX CPUID leaf Backport i386/cpu bugfixes Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") Signed-off-by: Yang Zhong Reviewed-by: Yang Weijiang Reviewed-by: Kai Huang Message-Id: <20230406064041.420039-1-yang.zhong@linux.intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 7f7143aa29..ddd5e30294 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6197,8 +6197,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } else { *eax &= env->features[FEAT_SGX_12_1_EAX]; *ebx &= 0; /* ebx reserve */ - *ecx &= env->features[FEAT_XSAVE_XSS_LO]; - *edx &= env->features[FEAT_XSAVE_XSS_HI]; + *ecx &= env->features[FEAT_XSAVE_XCR0_LO]; + *edx &= env->features[FEAT_XSAVE_XCR0_HI]; /* FP and SSE are always allowed regardless of XSAVE/XCR0. */ *ecx |= XSTATE_FP_MASK | XSTATE_SSE_MASK; -- Gitee From 3a1d6fd46dbaebe30cbd10cad962503554130b85 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Mon, 15 Jan 2024 04:13:24 -0500 Subject: [PATCH 19/23] i386/cpu: Clear FEAT_XSAVE_XSS_LO/HI leafs when CPUID_EXT_XSAVE is not available commit 81f5cad3858f27623b1b14467926032d229b76cc upstream. Leaf FEAT_XSAVE_XSS_LO and FEAT_XSAVE_XSS_HI also need to be cleared when CPUID_EXT_XSAVE is not set. Intel-SIG: commit 81f5cad3858f i386/cpu: Clear FEAT_XSAVE_XSS_LO/HI leafs when CPUID_EXT_XSAVE is not available Backport i386/cpu bugfixes Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") Signed-off-by: Xiaoyao Li Reviewed-by: Yang Weijiang Message-ID: <20240115091325.1904229-2-xiaoyao.li@intel.com> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index ddd5e30294..23dbaccb83 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6717,6 +6717,8 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { env->features[FEAT_XSAVE_XCR0_LO] = 0; env->features[FEAT_XSAVE_XCR0_HI] = 0; + env->features[FEAT_XSAVE_XSS_LO] = 0; + env->features[FEAT_XSAVE_XSS_HI] = 0; return; } -- Gitee From ec9713351119395fb7b32e7a896eafdde39dcfaa Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Mon, 15 Jan 2024 04:13:25 -0500 Subject: [PATCH 20/23] i386/cpu: Mask with XCR0/XSS mask for FEAT_XSAVE_XCR0_HI and FEAT_XSAVE_XSS_HI leafs commit a11a365159b944e05be76f3ec3b98c8b38cb70fd upstream. The value of FEAT_XSAVE_XCR0_HI leaf and FEAT_XSAVE_XSS_HI leaf also need to be masked by XCR0 and XSS mask respectively, to make it logically correct. Intel-SIG: commit a11a365159b9 i386/cpu: Mask with XCR0/XSS mask for FEAT_XSAVE_XCR0_HI and FEAT_XSAVE_XSS_HI leafs Backport i36/cpu bugfixes Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") Signed-off-by: Xiaoyao Li Reviewed-by: Yang Weijiang Message-ID: <20240115091325.1904229-3-xiaoyao.li@intel.com> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 23dbaccb83..f151a5a403 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6737,9 +6737,9 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) } env->features[FEAT_XSAVE_XCR0_LO] = mask & CPUID_XSTATE_XCR0_MASK; - env->features[FEAT_XSAVE_XCR0_HI] = mask >> 32; + env->features[FEAT_XSAVE_XCR0_HI] = (mask & CPUID_XSTATE_XCR0_MASK) >> 32; env->features[FEAT_XSAVE_XSS_LO] = mask & CPUID_XSTATE_XSS_MASK; - env->features[FEAT_XSAVE_XSS_HI] = mask >> 32; + env->features[FEAT_XSAVE_XSS_HI] = (mask & CPUID_XSTATE_XSS_MASK) >> 32; } /***** Steps involved on loading and filtering CPUID data -- Gitee From c87834b6ec23c9013b3fea80ae20b7680ee069d9 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 24 Jan 2024 21:40:14 -0500 Subject: [PATCH 21/23] i386/cpuid: Decrease cpuid_i when skipping CPUID leaf 1F commit 10f92799af8ba3c3cef2352adcd4780f13fbab31 upstream. Existing code misses a decrement of cpuid_i when skip leaf 0x1F. There's a blank CPUID entry(with leaf, subleaf as 0, and all fields stuffed 0s) left in the CPUID array. It conflicts with correct CPUID leaf 0. Intel-SIG: commit 10f92799af8b i386/cpuid: Decrease cpuid_i when skipping CPUID leaf 1F Backport i386/cpu bugfixes Signed-off-by: Xiaoyao Li Reviewed-by:Yang Weijiang Message-ID: <20240125024016.2521244-2-xiaoyao.li@intel.com> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 6a8e6641e8..965b52278d 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1783,6 +1783,7 @@ int kvm_arch_init_vcpu(CPUState *cs) } case 0x1f: if (env->nr_dies < 2) { + cpuid_i--; break; } /* fallthrough */ -- Gitee From 3be32d5c71bfc45c3b5537d2266bdede6af834e3 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 24 Jan 2024 21:40:16 -0500 Subject: [PATCH 22/23] i386/cpuid: Move leaf 7 to correct group commit 0729857c707535847d7fe31d3d91eb8b2a118e3c upstream. CPUID leaf 7 was grouped together with SGX leaf 0x12 by commit b9edbadefb9e ("i386: Propagate SGX CPUID sub-leafs to KVM") by mistake. SGX leaf 0x12 has its specific logic to check if subleaf (starting from 2) is valid or not by checking the bit 0:3 of corresponding EAX is 1 or not. Leaf 7 follows the logic that EAX of subleaf 0 enumerates the maximum valid subleaf. Intel-SIG: commit 0729857c7075 i386/cpuid: Move leaf 7 to correct group Backport i386/cpu bugfixes Fixes: b9edbadefb9e ("i386: Propagate SGX CPUID sub-leafs to KVM") Signed-off-by: Xiaoyao Li Message-ID: <20240125024016.2521244-4-xiaoyao.li@intel.com> Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Signed-off-by: Jason Zeng --- target/i386/kvm/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 965b52278d..fc50731bf9 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1824,7 +1824,6 @@ int kvm_arch_init_vcpu(CPUState *cs) c = &cpuid_data.entries[cpuid_i++]; } break; - case 0x7: case 0x12: for (j = 0; ; j++) { c->function = i; @@ -1844,6 +1843,7 @@ int kvm_arch_init_vcpu(CPUState *cs) c = &cpuid_data.entries[cpuid_i++]; } break; + case 0x7: case 0x14: case 0x1d: case 0x1e: { -- Gitee From a4b3a49219fcc4a69dbb66fc6b3218cedc0fb4b8 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 20 Mar 2024 17:31:38 +0800 Subject: [PATCH 23/23] target/i386: Introduce Icelake-Server-v7 to enable TSX commit c895fa54e3060c5ac6f3888dce96c9b78626072b upstream. When start L2 guest with both L1/L2 using Icelake-Server-v3 or above, QEMU reports below warning: "warning: host doesn't support requested feature: MSR(10AH).taa-no [bit 8]" Reason is QEMU Icelake-Server-v3 has TSX feature disabled but enables taa-no bit. It's meaningless that TSX isn't supported but still claim TSX is secure. So L1 KVM doesn't expose taa-no to L2 if TSX is unsupported, then starting L2 triggers the warning. Fix it by introducing a new version Icelake-Server-v7 which has both TSX and taa-no features. Then guest can use TSX securely when it see taa-no. This matches the production Icelake which supports TSX and isn't susceptible to TSX Async Abort (TAA) vulnerabilities, a.k.a, taa-no. Ideally, TSX should have being enabled together with taa-no since v3, but for compatibility, we'd better to add v7 to enable it. Fixes: d965dc35592d ("target/i386: Add ARCH_CAPABILITIES related bits into Icelake-Server CPU model") Intel-SIG: commit c895fa54e306 target/i386: Introduce Icelake-Server-v7 to enable TSX. Tested-by: Xiangfei Ma Signed-off-by: Zhenzhong Duan Message-ID: <20240320093138.80267-2-zhenzhong.duan@intel.com> Signed-off-by: Paolo Bonzini [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- target/i386/cpu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index f151a5a403..923fa99b6e 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3673,6 +3673,16 @@ static const X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } }, }, + { + .version = 7, + .note = "TSX, taa-no", + .props = (PropValue[]) { + /* Restore TSX features removed by -v2 above */ + { "hle", "on" }, + { "rtm", "on" }, + { /* end of list */ } + }, + }, { /* end of list */ } } }, -- Gitee