diff --git a/1097-Update-linux-headers-to-v6.0-rc4.patch b/1097-Update-linux-headers-to-v6.0-rc4.patch
new file mode 100644
index 0000000000000000000000000000000000000000..c555c3addf29d2d318bb3c5113ab511d148b3659
--- /dev/null
+++ b/1097-Update-linux-headers-to-v6.0-rc4.patch
@@ -0,0 +1,92 @@
+From 6e861716749048da6e5687ba3e822620667d3252 Mon Sep 17 00:00:00 2001
+From: Jason Zeng
+Date: Wed, 22 Feb 2023 13:59:37 +0800
+Subject: [PATCH 1097/1119] Update linux headers to v6.0-rc4
+
+commit d525f73f9186a5bc641b8caf0b2c9bb94e5aa963 upstream.
+
+commit 7e18e42e4b280c85b76967a9106a13ca61c16179
+
+Intel-SIG: d525f73f9186 ("Update linux headers to v6.0-rc4").
+Backport notify-vm-exit for qemu v6.2.0
+
+Signed-off-by: Chenyi Qiang
+Reviewed-by: Cornelia Huck
+Message-Id: <20220915091035.3897-3-chenyi.qiang@intel.com>
+Signed-off-by: Thomas Huth
+[ jason: only include missing header changes for notify-vm-exit ]
+Signed-off-by: Jason Zeng
+---
+ linux-headers/asm-x86/kvm.h | 6 +++++-
+ linux-headers/linux/kvm.h | 12 ++++++++++++
+ 2 files changed, 17 insertions(+), 1 deletion(-)
+
+diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h
+index bf6e96011dfe..8791f7c228e1 100644
+--- a/linux-headers/asm-x86/kvm.h
++++ b/linux-headers/asm-x86/kvm.h
+@@ -325,6 +325,7 @@ struct kvm_reinject_control {
+ #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+ #define KVM_VCPUEVENT_VALID_SMM 0x00000008
+ #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010
++#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020
+
+ /* Interrupt shadow states */
+ #define KVM_X86_SHADOW_INT_MOV_SS 0x01
+@@ -359,7 +360,10 @@ struct kvm_vcpu_events {
+ __u8 smm_inside_nmi;
+ __u8 latched_init;
+ } smi;
+- __u8 reserved[27];
++ struct {
++ __u8 pending;
++ } triple_fault;
++ __u8 reserved[26];
+ __u8 exception_has_payload;
+ __u64 exception_payload;
+ };
+diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
+index 3875127a375d..05d079a199cd 100644
+--- a/linux-headers/linux/kvm.h
++++ b/linux-headers/linux/kvm.h
+@@ -270,6 +270,7 @@ struct kvm_xen_exit {
+ #define KVM_EXIT_X86_BUS_LOCK 33
+ #define KVM_EXIT_XEN 34
+ #define KVM_EXIT_RISCV_SBI 35
++#define KVM_EXIT_NOTIFY 37
+
+ /* For KVM_EXIT_INTERNAL_ERROR */
+ /* Emulate instruction failed. */
+@@ -492,6 +493,11 @@ struct kvm_run {
+ unsigned long args[6];
+ unsigned long ret[2];
+ } riscv_sbi;
++ /* KVM_EXIT_NOTIFY */
++ struct {
++#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0)
++ __u32 flags;
++ } notify;
+ /* Fix the size of the union. */
+ char padding[256];
+ };
+@@ -1152,6 +1158,8 @@ struct kvm_ppc_resize_hpt {
+ /* #define KVM_CAP_VM_TSC_CONTROL 214 */
+ #define KVM_CAP_SYSTEM_EVENT_DATA 215
+ #define KVM_CAP_S390_PROTECTED_DUMP 217
++#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218
++#define KVM_CAP_X86_NOTIFY_VMEXIT 219
+ #define KVM_CAP_S390_ZPCI_OP 221
+ #define KVM_CAP_S390_CPU_TOPOLOGY 222
+ #define KVM_CAP_SEV_ES_GHCB 500
+@@ -2276,4 +2284,8 @@ struct kvm_s390_zpci_op {
+ /* flags for kvm_s390_zpci_op->u.reg_aen.flags */
+ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0)
+
++/* Available with KVM_CAP_X86_NOTIFY_VMEXIT */
++#define KVM_X86_NOTIFY_VMEXIT_ENABLED (1ULL << 0)
++#define KVM_X86_NOTIFY_VMEXIT_USER (1ULL << 1)
++
+ #endif /* __LINUX_KVM_H */
+--
+2.33.0
+
diff --git a/1098-i386-kvm-extend-kvm_-get-put-_vcpu_events-to-support.patch b/1098-i386-kvm-extend-kvm_-get-put-_vcpu_events-to-support.patch
new file mode 100644
index 0000000000000000000000000000000000000000..c97c17ebc1415c8bc62eac453dd0e327a9218656
--- /dev/null
+++ b/1098-i386-kvm-extend-kvm_-get-put-_vcpu_events-to-support.patch
@@ -0,0 +1,150 @@
+From a89ec0c53776552ded77c8645c96e03ac30bc4a7 Mon Sep 17 00:00:00 2001
+From: Chenyi Qiang
+Date: Thu, 29 Sep 2022 15:20:11 +0800
+Subject: [PATCH 1098/1119] i386: kvm: extend kvm_{get, put}_vcpu_events to
+ support pending triple fault
+
+commit 12f89a39cf3c5760cba82ce68929d748961f62df upstream.
+
+For the direct triple faults, i.e. hardware detected and KVM morphed
+to VM-Exit, KVM will never lose them. But for triple faults synthesized
+by KVM, e.g. the RSM path, if KVM exits to userspace before the request
+is serviced, userspace could migrate the VM and lose the triple fault.
+
+A new flag KVM_VCPUEVENT_VALID_TRIPLE_FAULT is defined to signal that
+the event.triple_fault_pending field contains a valid state if the
+KVM_CAP_X86_TRIPLE_FAULT_EVENT capability is enabled.
+
+Intel-SIG: commit 12f89a39cf3c ("i386: kvm: extend kvm_{get, put}_vcpu_events to support pending triple fault").
+Backport notify-vm-exit for qemu v6.2.0
+
+Acked-by: Peter Xu
+Signed-off-by: Chenyi Qiang
+Message-Id: <20220929072014.20705-2-chenyi.qiang@intel.com>
+Signed-off-by: Paolo Bonzini
+[ jason: amend commit message ]
+Signed-off-by: Jason Zeng
+---
+ target/i386/cpu.c | 1 +
+ target/i386/cpu.h | 1 +
+ target/i386/kvm/kvm.c | 20 ++++++++++++++++++++
+ target/i386/machine.c | 20 ++++++++++++++++++++
+ 4 files changed, 42 insertions(+)
+
+diff --git a/target/i386/cpu.c b/target/i386/cpu.c
+index 62149367b106..67b359328569 100644
+--- a/target/i386/cpu.c
++++ b/target/i386/cpu.c
+@@ -6577,6 +6577,7 @@ static void x86_cpu_reset(DeviceState *dev)
+ env->exception_has_payload = false;
+ env->exception_payload = 0;
+ env->nmi_injected = false;
++ env->triple_fault_pending = false;
+ #if !defined(CONFIG_USER_ONLY)
+ /* We hard-wire the BSP to the first CPU. */
+ apic_designate_bsp(cpu->apic_state, s->cpu_index == 0);
+diff --git a/target/i386/cpu.h b/target/i386/cpu.h
+index e84cb8265c74..c415bb3db890 100644
+--- a/target/i386/cpu.h
++++ b/target/i386/cpu.h
+@@ -1736,6 +1736,7 @@ typedef struct CPUX86State {
+ uint8_t has_error_code;
+ uint8_t exception_has_payload;
+ uint64_t exception_payload;
++ uint8_t triple_fault_pending;
+ uint32_t ins_len;
+ uint32_t sipi_vector;
+ bool tsc_valid;
+diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
+index cecc3d035d8c..07c5f7dc932d 100644
+--- a/target/i386/kvm/kvm.c
++++ b/target/i386/kvm/kvm.c
+@@ -129,6 +129,7 @@ static int has_xcrs;
+ static int has_pit_state2;
+ static int has_exception_payload;
+ static int has_map_gpa_range;
++static int has_triple_fault_event;
+
+ static bool has_msr_mcg_ext_ctl;
+
+@@ -2455,6 +2456,16 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
+ }
+ }
+
++ has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT);
++ if (has_triple_fault_event) {
++ ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true);
++ if (ret < 0) {
++ error_report("kvm: Failed to enable triple fault event cap: %s",
++ strerror(-ret));
++ return ret;
++ }
++ }
++
+ ret = kvm_get_supported_msrs(s);
+ if (ret < 0) {
+ return ret;
+@@ -4114,6 +4125,11 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
+ }
+ }
+
++ if (has_triple_fault_event) {
++ events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
++ events.triple_fault.pending = env->triple_fault_pending;
++ }
++
+ return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
+ }
+
+@@ -4183,6 +4199,10 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
+ }
+ }
+
++ if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
++ env->triple_fault_pending = events.triple_fault.pending;
++ }
++
+ env->sipi_vector = events.sipi_vector;
+
+ return 0;
+diff --git a/target/i386/machine.c b/target/i386/machine.c
+index 8aa54432d7d4..7fe80e1aa01f 100644
+--- a/target/i386/machine.c
++++ b/target/i386/machine.c
+@@ -1518,6 +1518,25 @@ static const VMStateDescription vmstate_msr_ghcb_gpa = {
+ };
+ #endif
+
++static bool triple_fault_needed(void *opaque)
++{
++ X86CPU *cpu = opaque;
++ CPUX86State *env = &cpu->env;
++
++ return env->triple_fault_pending;
++}
++
++static const VMStateDescription vmstate_triple_fault = {
++ .name = "cpu/triple_fault",
++ .version_id = 1,
++ .minimum_version_id = 1,
++ .needed = triple_fault_needed,
++ .fields = (VMStateField[]) {
++ VMSTATE_UINT8(env.triple_fault_pending, X86CPU),
++ VMSTATE_END_OF_LIST()
++ }
++};
++
+ const VMStateDescription vmstate_x86_cpu = {
+ .name = "cpu",
+ .version_id = 12,
+@@ -1663,6 +1682,7 @@ const VMStateDescription vmstate_x86_cpu = {
+ #if defined(CONFIG_KVM) && defined(TARGET_X86_64)
+ &vmstate_msr_ghcb_gpa,
+ #endif
++ &vmstate_triple_fault,
+ NULL
+ }
+ };
+--
+2.33.0
+
diff --git a/1099-kvm-allow-target-specific-accelerator-properties.patch b/1099-kvm-allow-target-specific-accelerator-properties.patch
new file mode 100644
index 0000000000000000000000000000000000000000..14e4fc25e3d7f641dbbfa4330ecbf36bb8340d3b
--- /dev/null
+++ b/1099-kvm-allow-target-specific-accelerator-properties.patch
@@ -0,0 +1,135 @@
+From 96243c578cb34d94acf2c481cc254d3b616219d1 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini
+Date: Thu, 29 Sep 2022 15:20:12 +0800
+Subject: [PATCH 1099/1119] kvm: allow target-specific accelerator properties
+
+commit 3dba0a335cf5c53146b606be6ddfab4df81c464e upstream.
+
+Several hypervisor capabilities in KVM are target-specific. When exposed
+to QEMU users as accelerator properties (i.e. -accel kvm,prop=value), they
+should not be available for all targets.
+
+Add a hook for targets to add their own properties to -accel kvm, for
+now no such property is defined.
+
+Intel-SIG: commit 3dba0a335cf5 ("kvm: allow target-specific accelerator properties").
+Backport notify-vm-exit for qemu v6.2.0
+
+Signed-off-by: Paolo Bonzini
+Message-Id: <20220929072014.20705-3-chenyi.qiang@intel.com>
+Signed-off-by: Paolo Bonzini
+[ jason: remove changes in target/riscv/kvm.c since riscv kvm is not
+ supported in qemu-6.2.0, meanwhile add loongarch64 support ]
+Signed-off-by: Jason Zeng
+---
+ accel/kvm/kvm-all.c | 2 ++
+ include/sysemu/kvm.h | 2 ++
+ target/arm/kvm.c | 4 ++++
+ target/i386/kvm/kvm.c | 4 ++++
+ target/loongarch64/kvm.c | 4 ++++
+ target/mips/kvm.c | 4 ++++
+ target/ppc/kvm.c | 4 ++++
+ target/s390x/kvm/kvm.c | 4 ++++
+ 8 files changed, 28 insertions(+)
+
+diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
+index 10af4170d938..573b2ba38269 100644
+--- a/accel/kvm/kvm-all.c
++++ b/accel/kvm/kvm-all.c
+@@ -3759,6 +3759,8 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data)
+ NULL, NULL);
+ object_class_property_set_description(oc, "dirty-ring-size",
+ "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
++
++ kvm_arch_accel_class_init(oc);
+ }
+
+ static const TypeInfo kvm_accel_type = {
+diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
+index 9f8099f48769..4bcec46a4402 100644
+--- a/include/sysemu/kvm.h
++++ b/include/sysemu/kvm.h
+@@ -334,6 +334,8 @@ bool kvm_device_supported(int vmfd, uint64_t type);
+
+ extern const KVMCapabilityInfo kvm_arch_required_capabilities[];
+
++void kvm_arch_accel_class_init(ObjectClass *oc);
++
+ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run);
+ MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run);
+
+diff --git a/target/arm/kvm.c b/target/arm/kvm.c
+index 1ae4e510558f..7bec3e5f4ff8 100644
+--- a/target/arm/kvm.c
++++ b/target/arm/kvm.c
+@@ -1059,3 +1059,7 @@ bool kvm_arch_cpu_check_are_resettable(void)
+ {
+ return true;
+ }
++
++void kvm_arch_accel_class_init(ObjectClass *oc)
++{
++}
+diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
+index 07c5f7dc932d..6f5c1a804b4c 100644
+--- a/target/i386/kvm/kvm.c
++++ b/target/i386/kvm/kvm.c
+@@ -5424,3 +5424,7 @@ void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
+ mask &= ~BIT_ULL(bit);
+ }
+ }
++
++void kvm_arch_accel_class_init(ObjectClass *oc)
++{
++}
+diff --git a/target/loongarch64/kvm.c b/target/loongarch64/kvm.c
+index 0eaabe39436c..8d2f84995e2a 100644
+--- a/target/loongarch64/kvm.c
++++ b/target/loongarch64/kvm.c
+@@ -1417,3 +1417,7 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
+ {
+ abort();
+ }
++
++void kvm_arch_accel_class_init(ObjectClass *oc)
++{
++}
+diff --git a/target/mips/kvm.c b/target/mips/kvm.c
+index 086debd9f013..f80ac72dd185 100644
+--- a/target/mips/kvm.c
++++ b/target/mips/kvm.c
+@@ -1295,3 +1295,7 @@ bool kvm_arch_cpu_check_are_resettable(void)
+ {
+ return true;
+ }
++
++void kvm_arch_accel_class_init(ObjectClass *oc)
++{
++}
+diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
+index 154888cce509..6e15b1f2333e 100644
+--- a/target/ppc/kvm.c
++++ b/target/ppc/kvm.c
+@@ -2986,3 +2986,7 @@ void kvmppc_svm_allow(Error **errp)
+ error_setg(errp, "Error enabling x-svm-allowed, try x-svm-allowed=off");
+ }
+ }
++
++void kvm_arch_accel_class_init(ObjectClass *oc)
++{
++}
+diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
+index 6d1a6324b9e2..dbb911c83251 100644
+--- a/target/s390x/kvm/kvm.c
++++ b/target/s390x/kvm/kvm.c
+@@ -2677,3 +2677,7 @@ int kvm_s390_get_zpci_op(void)
+ {
+ return cap_zpci_op;
+ }
++
++void kvm_arch_accel_class_init(ObjectClass *oc)
++{
++}
+--
+2.33.0
+
diff --git a/1100-kvm-expose-struct-KVMState.patch b/1100-kvm-expose-struct-KVMState.patch
new file mode 100644
index 0000000000000000000000000000000000000000..c732bd0996dc9910268ed89f0140959d2f5a8b40
--- /dev/null
+++ b/1100-kvm-expose-struct-KVMState.patch
@@ -0,0 +1,212 @@
+From aea4838be0c5b7eb5d128462e87b5884d42c72d6 Mon Sep 17 00:00:00 2001
+From: Chenyi Qiang
+Date: Thu, 29 Sep 2022 15:20:13 +0800
+Subject: [PATCH 1100/1119] kvm: expose struct KVMState
+
+commit 5f8a6bce1f1080058ed29d716cae81ea805142ae upstream.
+
+Expose struct KVMState out of kvm-all.c so that the field of struct
+KVMState can be accessed when defining target-specific accelerator
+properties.
+
+Intel-SIG: commit 5f8a6bce1f10 ("kvm: expose struct KVMState").
+Backport notify-vm-exit for qemu v6.2.0
+
+Signed-off-by: Chenyi Qiang
+Message-Id: <20220929072014.20705-4-chenyi.qiang@intel.com>
+Signed-off-by: Paolo Bonzini
+[ jason: amend commit message ]
+Signed-off-by: Jason Zeng
+---
+ accel/kvm/kvm-all.c | 74 --------------------------------
+ include/sysemu/kvm_int.h | 76 ++++++++++++++++++++++++++++++++++
+ 2 files changed, 76 insertions(+), 74 deletions(-)
+
+diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
+index 573b2ba38269..e8faa95059f1 100644
+--- a/accel/kvm/kvm-all.c
++++ b/accel/kvm/kvm-all.c
+@@ -73,86 +73,12 @@
+ do { } while (0)
+ #endif
+
+-#define KVM_MSI_HASHTAB_SIZE 256
+-
+ struct KVMParkedVcpu {
+ unsigned long vcpu_id;
+ int kvm_fd;
+ QLIST_ENTRY(KVMParkedVcpu) node;
+ };
+
+-enum KVMDirtyRingReaperState {
+- KVM_DIRTY_RING_REAPER_NONE = 0,
+- /* The reaper is sleeping */
+- KVM_DIRTY_RING_REAPER_WAIT,
+- /* The reaper is reaping for dirty pages */
+- KVM_DIRTY_RING_REAPER_REAPING,
+-};
+-
+-/*
+- * KVM reaper instance, responsible for collecting the KVM dirty bits
+- * via the dirty ring.
+- */
+-struct KVMDirtyRingReaper {
+- /* The reaper thread */
+- QemuThread reaper_thr;
+- volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
+- volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
+-};
+-
+-struct KVMState
+-{
+- AccelState parent_obj;
+-
+- int nr_slots;
+- int fd;
+- int vmfd;
+- int coalesced_mmio;
+- int coalesced_pio;
+- struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
+- bool coalesced_flush_in_progress;
+- int vcpu_events;
+- int robust_singlestep;
+- int debugregs;
+-#ifdef KVM_CAP_SET_GUEST_DEBUG
+- QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
+-#endif
+- int max_nested_state_len;
+- int many_ioeventfds;
+- int intx_set_mask;
+- int kvm_shadow_mem;
+- bool kernel_irqchip_allowed;
+- bool kernel_irqchip_required;
+- OnOffAuto kernel_irqchip_split;
+- bool sync_mmu;
+- uint64_t manual_dirty_log_protect;
+- /* The man page (and posix) say ioctl numbers are signed int, but
+- * they're not. Linux, glibc and *BSD all treat ioctl numbers as
+- * unsigned, and treating them as signed here can break things */
+- unsigned irq_set_ioctl;
+- unsigned int sigmask_len;
+- GHashTable *gsimap;
+-#ifdef KVM_CAP_IRQ_ROUTING
+- struct kvm_irq_routing *irq_routes;
+- int nr_allocated_irq_routes;
+- unsigned long *used_gsi_bitmap;
+- unsigned int gsi_count;
+- QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
+-#endif
+- KVMMemoryListener memory_listener;
+- QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
+-
+- /* For "info mtree -f" to tell if an MR is registered in KVM */
+- int nr_as;
+- struct KVMAs {
+- KVMMemoryListener *ml;
+- AddressSpace *as;
+- } *as;
+- uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */
+- uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */
+- struct KVMDirtyRingReaper reaper;
+-};
+-
+ KVMState *kvm_state;
+ bool kvm_kernel_irqchip;
+ bool kvm_split_irqchip;
+diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
+index 7e18c0a3c0a7..60b520a13e84 100644
+--- a/include/sysemu/kvm_int.h
++++ b/include/sysemu/kvm_int.h
+@@ -10,6 +10,7 @@
+ #define QEMU_KVM_INT_H
+
+ #include "exec/memory.h"
++#include "qapi/qapi-types-common.h"
+ #include "qemu/accel.h"
+ #include "qemu/queue.h"
+ #include "sysemu/kvm.h"
+@@ -44,6 +45,81 @@ typedef struct KVMMemoryListener {
+ QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
+ } KVMMemoryListener;
+
++#define KVM_MSI_HASHTAB_SIZE 256
++
++enum KVMDirtyRingReaperState {
++ KVM_DIRTY_RING_REAPER_NONE = 0,
++ /* The reaper is sleeping */
++ KVM_DIRTY_RING_REAPER_WAIT,
++ /* The reaper is reaping for dirty pages */
++ KVM_DIRTY_RING_REAPER_REAPING,
++};
++
++/*
++ * KVM reaper instance, responsible for collecting the KVM dirty bits
++ * via the dirty ring.
++ */
++struct KVMDirtyRingReaper {
++ /* The reaper thread */
++ QemuThread reaper_thr;
++ volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
++ volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
++};
++struct KVMState
++{
++ AccelState parent_obj;
++
++ int nr_slots;
++ int fd;
++ int vmfd;
++ int coalesced_mmio;
++ int coalesced_pio;
++ struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
++ bool coalesced_flush_in_progress;
++ int vcpu_events;
++ int robust_singlestep;
++ int debugregs;
++#ifdef KVM_CAP_SET_GUEST_DEBUG
++ QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
++#endif
++ int max_nested_state_len;
++ int many_ioeventfds;
++ int intx_set_mask;
++ int kvm_shadow_mem;
++ bool kernel_irqchip_allowed;
++ bool kernel_irqchip_required;
++ OnOffAuto kernel_irqchip_split;
++ bool sync_mmu;
++ uint64_t manual_dirty_log_protect;
++ /* The man page (and posix) say ioctl numbers are signed int, but
++ * they're not. Linux, glibc and *BSD all treat ioctl numbers as
++ * unsigned, and treating them as signed here can break things */
++ unsigned irq_set_ioctl;
++ unsigned int sigmask_len;
++ GHashTable *gsimap;
++#ifdef KVM_CAP_IRQ_ROUTING
++ struct kvm_irq_routing *irq_routes;
++ int nr_allocated_irq_routes;
++ unsigned long *used_gsi_bitmap;
++ unsigned int gsi_count;
++ QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
++#endif
++ KVMMemoryListener memory_listener;
++ QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
++
++ /* For "info mtree -f" to tell if an MR is registered in KVM */
++ int nr_as;
++ struct KVMAs {
++ KVMMemoryListener *ml;
++ AddressSpace *as;
++ } *as;
++ uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */
++ uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */
++ struct KVMDirtyRingReaper reaper;
++ NotifyVmexitOption notify_vmexit;
++ uint32_t notify_window;
++};
++
+ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
+ AddressSpace *as, int as_id, const char *name);
+
+--
+2.33.0
+
diff --git a/1101-i386-add-notify-VM-exit-support.patch b/1101-i386-add-notify-VM-exit-support.patch
new file mode 100644
index 0000000000000000000000000000000000000000..22496a88077e7b03752bc7eb11bc6e078a9f081d
--- /dev/null
+++ b/1101-i386-add-notify-VM-exit-support.patch
@@ -0,0 +1,264 @@
+From bbde51e14341c3d4056c9b0018be9fa31d02c98c Mon Sep 17 00:00:00 2001
+From: Chenyi Qiang
+Date: Thu, 29 Sep 2022 15:20:14 +0800
+Subject: [PATCH 1101/1119] i386: add notify VM exit support
+
+commit e2e69f6bb907a70ac518230c54e98e7abcb0c911 upstream.
+
+There are cases that malicious virtual machine can cause CPU stuck (due
+to event windows don't open up), e.g., infinite loop in microcode when
+nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and
+IRQ) can be delivered. It leads the CPU to be unavailable to host or
+other VMs. Notify VM exit is introduced to mitigate such kind of
+attacks, which will generate a VM exit if no event window occurs in VM
+non-root mode for a specified amount of time (notify window).
+
+A new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT is exposed to user space
+so that the user can query the capability and set the expected notify
+window when creating VMs. The format of the argument when enabling this
+capability is as follows:
+ Bit 63:32 - notify window specified in qemu command
+ Bit 31:0 - some flags (e.g. KVM_X86_NOTIFY_VMEXIT_ENABLED is set to
+ enable the feature.)
+
+Users can configure the feature by a new (x86 only) accel property:
+ qemu -accel kvm,notify-vmexit=run|internal-error|disable,notify-window=n
+
+The default option of notify-vmexit is run, which will enable the
+capability and do nothing if the exit happens. The internal-error option
+raises a KVM internal error if it happens. The disable option does not
+enable the capability. The default value of notify-window is 0. It is valid
+only when notify-vmexit is not disabled. The valid range of notify-window
+is non-negative. It is even safe to set it to zero since there's an
+internal hardware threshold to be added to ensure no false positive.
+
+Because a notify VM exit may happen with VM_CONTEXT_INVALID set in exit
+qualification (no cases are anticipated that would set this bit), which
+means VM context is corrupted. It would be reflected in the flags of
+KVM_EXIT_NOTIFY exit. If KVM_NOTIFY_CONTEXT_INVALID bit is set, raise a KVM
+internal error unconditionally.
+
+Intel-SIG: commit e2e69f6bb907 ("i386: add notify VM exit support").
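+
+A minimal sketch of the argument layout described above, mirroring the
+target/i386/kvm/kvm.c hunk below:
+
+    /* bits 63:32 carry the notify window, bits 31:0 carry the flags */
+    uint64_t notify_window_flags = ((uint64_t)s->notify_window << 32) |
+                                   KVM_X86_NOTIFY_VMEXIT_ENABLED |
+                                   KVM_X86_NOTIFY_VMEXIT_USER;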
+Backport notify-vm-exit for qemu v6.2.0
+
+Acked-by: Peter Xu
+Signed-off-by: Chenyi Qiang
+Message-Id: <20220929072014.20705-5-chenyi.qiang@intel.com>
+Signed-off-by: Paolo Bonzini
+[ jason: amend commit message ]
+Signed-off-by: Jason Zeng
+---
+ accel/kvm/kvm-all.c | 2 ++
+ qapi/run-state.json | 17 ++++++++
+ qemu-options.hx | 11 +++++
+ target/i386/kvm/kvm.c | 98 +++++++++++++++++++++++++++++++++++++++++++
+ 4 files changed, 128 insertions(+)
+
+diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
+index e8faa95059f1..328fccd7fac1 100644
+--- a/accel/kvm/kvm-all.c
++++ b/accel/kvm/kvm-all.c
+@@ -3658,6 +3658,8 @@ static void kvm_accel_instance_init(Object *obj)
+ s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
+ /* KVM dirty ring is by default off */
+ s->kvm_dirty_ring_size = 0;
++ s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
++ s->notify_window = 0;
+ }
+
+ static void kvm_accel_class_init(ObjectClass *oc, void *data)
+diff --git a/qapi/run-state.json b/qapi/run-state.json
+index 43d66d700fcd..08c38b2c67bf 100644
+--- a/qapi/run-state.json
++++ b/qapi/run-state.json
+@@ -638,3 +638,20 @@
+ { 'struct': 'MemoryFailureFlags',
+ 'data': { 'action-required': 'bool',
+ 'recursive': 'bool'} }
++
++##
++# @NotifyVmexitOption:
++#
++# An enumeration of the options specified when enabling notify VM exit
++#
++# @run: enable the feature, do nothing and continue if the notify VM exit happens.
++#
++# @internal-error: enable the feature, raise an internal error if the notify
++# VM exit happens.
++#
++# @disable: disable the feature.
++#
++# Since: 7.2
++##
++{ 'enum': 'NotifyVmexitOption',
++ 'data': [ 'run', 'internal-error', 'disable' ] }
+\ No newline at end of file
+diff --git a/qemu-options.hx b/qemu-options.hx
+index eaf16a68e0c8..c4dc51c37c4b 100644
+--- a/qemu-options.hx
++++ b/qemu-options.hx
+@@ -152,6 +152,7 @@ DEF("accel", HAS_ARG, QEMU_OPTION_accel,
+ " split-wx=on|off (enable TCG split w^x mapping)\n"
+ " tb-size=n (TCG translation block cache size)\n"
+ " dirty-ring-size=n (KVM dirty ring GFN count, default 0)\n"
++ " notify-vmexit=run|internal-error|disable,notify-window=n (enable notify VM exit and set notify window, x86 only)\n"
+ " thread=single|multi (enable multi-threaded TCG)\n", QEMU_ARCH_ALL)
+ SRST
+ ``-accel name[,prop=value[,...]]``
+@@ -203,6 +204,16 @@ SRST
+ is disabled (dirty-ring-size=0). When enabled, KVM will instead
+ record dirty pages in a bitmap.
+
++ ``notify-vmexit=run|internal-error|disable,notify-window=n``
++ Enables or disables notify VM exit support on x86 host and specify
++ the corresponding notify window to trigger the VM exit if enabled.
++ ``run`` option enables the feature. It does nothing and continues
++ if the exit happens. ``internal-error`` option enables the feature.
++ It raises an internal error. ``disable`` option doesn't enable the feature.
++ This feature can mitigate the CPU stuck issue due to event windows that
++ don't open up for a specified amount of time (i.e. notify-window).
++ Default: notify-vmexit=run,notify-window=0.
++
+ ERST
+
+ DEF("smp", HAS_ARG, QEMU_OPTION_smp,
+diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
+index 6f5c1a804b4c..ae50575edc7e 100644
+--- a/target/i386/kvm/kvm.c
++++ b/target/i386/kvm/kvm.c
+@@ -15,6 +15,7 @@
+ #include "qemu/osdep.h"
+ #include "qapi/qapi-events-run-state.h"
+ #include "qapi/error.h"
++#include "qapi/visitor.h"
+ #include <sys/ioctl.h>
+ #include <sys/utsname.h>
+ #include <sys/syscall.h>
+@@ -2591,6 +2592,21 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
+ }
+ }
+
++ if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE &&
++ kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) {
++ uint64_t notify_window_flags =
++ ((uint64_t)s->notify_window << 32) |
++ KVM_X86_NOTIFY_VMEXIT_ENABLED |
++ KVM_X86_NOTIFY_VMEXIT_USER;
++ ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0,
++ notify_window_flags);
++ if (ret < 0) {
++ error_report("kvm: Failed to enable notify vmexit cap: %s",
++ strerror(-ret));
++ return ret;
++ }
++ }
++
+ return 0;
+ }
+
+@@ -5082,6 +5098,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
+ X86CPU *cpu = X86_CPU(cs);
+ uint64_t code;
+ int ret;
++ bool ctx_invalid;
++ char str[256];
++ KVMState *state;
+
+ switch (run->exit_reason) {
+ case KVM_EXIT_HLT:
+@@ -5150,6 +5169,21 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
+ case KVM_EXIT_HYPERCALL:
+ ret = kvm_handle_exit_hypercall(cpu, run);
+ break;
++ case KVM_EXIT_NOTIFY:
++ ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID);
++ state = KVM_STATE(current_accel());
++ sprintf(str, "Encounter a notify exit with %svalid context in"
++ " guest. There can be possible misbehaves in guest."
++ " Please have a look.", ctx_invalid ? "in" : "");
++ if (ctx_invalid ||
++ state->notify_vmexit == NOTIFY_VMEXIT_OPTION_INTERNAL_ERROR) {
++ warn_report("KVM internal error: %s", str);
++ ret = -1;
++ } else {
++ warn_report_once("KVM: %s", str);
++ ret = 0;
++ }
++ break;
+ default:
+ fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
+ ret = -1;
+@@ -5425,6 +5459,70 @@ void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
+ }
+ }
+
++static int kvm_arch_get_notify_vmexit(Object *obj, Error **errp)
++{
++ KVMState *s = KVM_STATE(obj);
++ return s->notify_vmexit;
++}
++
++static void kvm_arch_set_notify_vmexit(Object *obj, int value, Error **errp)
++{
++ KVMState *s = KVM_STATE(obj);
++
++ if (s->fd != -1) {
++ error_setg(errp, "Cannot set properties after the accelerator has been initialized");
++ return;
++ }
++
++ s->notify_vmexit = value;
++}
++
++static void kvm_arch_get_notify_window(Object *obj, Visitor *v,
++ const char *name, void *opaque,
++ Error **errp)
++{
++ KVMState *s = KVM_STATE(obj);
++ uint32_t value = s->notify_window;
++
++ visit_type_uint32(v, name, &value, errp);
++}
++
++static void kvm_arch_set_notify_window(Object *obj, Visitor *v,
++ const char *name, void *opaque,
++ Error **errp)
++{
++ KVMState *s = KVM_STATE(obj);
++ Error *error = NULL;
++ uint32_t value;
++
++ if (s->fd != -1) {
++ error_setg(errp, "Cannot set properties after the accelerator has been initialized");
++ return;
++ }
++
++ visit_type_uint32(v, name, &value, &error);
++ if (error) {
++ error_propagate(errp, error);
++ return;
++ }
++
++ s->notify_window = value;
++}
++
+ void kvm_arch_accel_class_init(ObjectClass *oc)
+ {
++ object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
++ &NotifyVmexitOption_lookup,
++ kvm_arch_get_notify_vmexit,
++ kvm_arch_set_notify_vmexit);
++ object_class_property_set_description(oc, "notify-vmexit",
++ "Enable notify VM exit");
++
++ object_class_property_add(oc, "notify-window", "uint32",
++ kvm_arch_get_notify_window,
++ kvm_arch_set_notify_window,
++ NULL, NULL);
++ object_class_property_set_description(oc, "notify-window",
++ "Clock cycles without an event window "
++ "after which a notification VM exit occurs");
+ }
+--
+2.33.0
+
diff --git a/1102-target-i386-Fix-sanity-check-on-max-APIC-ID-X2APIC-e.patch b/1102-target-i386-Fix-sanity-check-on-max-APIC-ID-X2APIC-e.patch
new file mode 100644
index 0000000000000000000000000000000000000000..cada32d98944a4038e3f5655877829afba9ccc7c
--- /dev/null
+++ b/1102-target-i386-Fix-sanity-check-on-max-APIC-ID-X2APIC-e.patch
@@ -0,0 +1,104 @@
+From a7dc028b392e5ae8e2e258396ce3d921720c2bb2 Mon Sep 17 00:00:00 2001
+From: David Woodhouse
+Date: Mon, 14 Mar 2022 14:25:41 +0000
+Subject: [PATCH 1102/1119] target/i386: Fix sanity check on max APIC ID /
+ X2APIC enablement
+
+commit dc89f32d92bba795b0665f075b78d8881cf67ab3 upstream.
+
+The check on x86ms->apic_id_limit in pc_machine_done() had two problems.
+
+Firstly, we need KVM to support the X2APIC API in order to allow IRQ
+delivery to APICs >= 255. So we need to call/check kvm_enable_x2apic(),
+which was done elsewhere in *some* cases but not all.
+
+Secondly, microvm needs the same check. So move it from pc_machine_done()
+to x86_cpus_init() where it will work for both.
+
+The check in kvm_cpu_instance_init() is now redundant and can be dropped.
+
+Intel-SIG: commit dc89f32d92bb ("target/i386: Fix sanity check on max APIC ID / X2APIC enablement").
+Backport bugfix of max APIC ID / X2APIC for qemu-6.2.0
+
+Signed-off-by: David Woodhouse
+Acked-by: Claudio Fontana
+Message-Id: <20220314142544.150555-1-dwmw2@infradead.org>
+Reviewed-by: Michael S. Tsirkin
+Signed-off-by: Michael S. Tsirkin
+[ jason: amend commit log ]
+Signed-off-by: Jason Zeng
+---
+ hw/i386/pc.c | 8 --------
+ hw/i386/x86.c | 16 ++++++++++++++++
+ target/i386/kvm/kvm-cpu.c | 2 +-
+ 3 files changed, 17 insertions(+), 9 deletions(-)
+
+diff --git a/hw/i386/pc.c b/hw/i386/pc.c
+index 357257349be2..5b926bed11f4 100644
+--- a/hw/i386/pc.c
++++ b/hw/i386/pc.c
+@@ -1035,14 +1035,6 @@ void pc_machine_done(Notifier *notifier, void *data)
+ /* update FW_CFG_NB_CPUS to account for -device added CPUs */
+ fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
+ }
+-
+-
+- if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
+- !kvm_irqchip_in_kernel()) {
+- error_report("current -smp configuration requires kernel "
+- "irqchip support.");
+- exit(EXIT_FAILURE);
+- }
+ }
+
+ void pc_guest_info_init(PCMachineState *pcms)
+diff --git a/hw/i386/x86.c b/hw/i386/x86.c
+index b84840a1bb99..f64639b8731c 100644
+--- a/hw/i386/x86.c
++++ b/hw/i386/x86.c
+@@ -39,6 +39,7 @@
+ #include "sysemu/replay.h"
+ #include "sysemu/sysemu.h"
+ #include "sysemu/cpu-timers.h"
++#include "sysemu/xen.h"
+ #include "trace.h"
+
+ #include "hw/i386/x86.h"
+@@ -136,6 +137,21 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
+ */
+ x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
+ ms->smp.max_cpus - 1) + 1;
++
++ /*
++ * Can we support APIC ID 255 or higher?
++ *
++ * Under Xen: yes.
++ * With userspace emulated lapic: no
++ * With KVM's in-kernel lapic: only if X2APIC API is enabled.
++ */
++ if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
++ (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) {
++ error_report("current -smp configuration requires kernel "
++ "irqchip and X2APIC API support.");
++ exit(EXIT_FAILURE);
++ }
++
+ possible_cpus = mc->possible_cpu_arch_ids(ms);
+ for (i = 0; i < ms->smp.cpus; i++) {
+ x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
+diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
+index 74c1396a93ff..7b8a3d5af03b 100644
+--- a/target/i386/kvm/kvm-cpu.c
++++ b/target/i386/kvm/kvm-cpu.c
+@@ -172,7 +172,7 @@ static void kvm_cpu_instance_init(CPUState *cs)
+ /* only applies to builtin_x86_defs cpus */
+ if (!kvm_irqchip_in_kernel()) {
+ x86_cpu_change_kvm_default("x2apic", "off");
+- } else if (kvm_irqchip_is_split() && kvm_enable_x2apic()) {
++ } else if (kvm_irqchip_is_split()) {
+ x86_cpu_change_kvm_default("kvm-msi-ext-dest-id", "on");
+ }
+
+--
+2.33.0
+
diff --git a/1103-target-i386-Set-maximum-APIC-ID-to-KVM-prior-to-vCPU.patch b/1103-target-i386-Set-maximum-APIC-ID-to-KVM-prior-to-vCPU.patch
new file mode 100644
index 0000000000000000000000000000000000000000..bf3e017ce58a692337e79d7d554ea9c8b22a33c1
--- /dev/null
+++ b/1103-target-i386-Set-maximum-APIC-ID-to-KVM-prior-to-vCPU.patch
@@ -0,0 +1,90 @@
+From 46336eed191fea4f326075f06e7dbb64f982f1db Mon Sep 17 00:00:00 2001
+From: Zeng Guang
+Date: Thu, 25 Aug 2022 10:52:46 +0800
+Subject: [PATCH 1103/1119] target/i386: Set maximum APIC ID to KVM prior to
+ vCPU creation
+
+commit 19e2a9fb9da067acba95b3be83588bda5a3f6a99 upstream.
+
+Specify maximum possible APIC ID assigned for current VM session to KVM
+prior to the creation of vCPUs. By this setting, KVM can set up VM-scoped
+data structure indexed by the APIC ID, e.g. Posted-Interrupt Descriptor
+pointer table to support Intel IPI virtualization, with the most optimal
+memory footprint.
+
+It can be achieved by calling KVM_ENABLE_CAP for KVM_CAP_MAX_VCPU_ID
+capability once KVM has enabled it. Ignoring the return error if KVM
+doesn't support this capability yet.
+
+Intel-SIG: commit 19e2a9fb9da0 ("target/i386: Set maximum APIC ID to KVM prior to vCPU creation").
+Backport setting maximum APIC ID for qemu-6.2.0
+
+Signed-off-by: Zeng Guang
+Acked-by: Peter Xu
+Acked-by: Michael S. Tsirkin
+Message-Id: <20220825025246.26618-1-guang.zeng@intel.com>
+Signed-off-by: Paolo Bonzini
+[ jason: amend commit log ]
+Signed-off-by: Jason Zeng
+---
+ hw/i386/x86.c | 4 ++++
+ target/i386/kvm/kvm-stub.c | 5 +++++
+ target/i386/kvm/kvm.c | 5 +++++
+ target/i386/kvm/kvm_i386.h | 2 ++
+ 4 files changed, 16 insertions(+)
+
+diff --git a/hw/i386/x86.c b/hw/i386/x86.c
+index f64639b8731c..a3258d78facf 100644
+--- a/hw/i386/x86.c
++++ b/hw/i386/x86.c
+@@ -152,6 +152,10 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
+ exit(EXIT_FAILURE);
+ }
+
++ if (kvm_enabled()) {
++ kvm_set_max_apic_id(x86ms->apic_id_limit);
++ }
++
+ possible_cpus = mc->possible_cpu_arch_ids(ms);
+ for (i = 0; i < ms->smp.cpus; i++) {
+ x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
+diff --git a/target/i386/kvm/kvm-stub.c b/target/i386/kvm/kvm-stub.c
+index f6e7e4466e1a..e052f1c7b0ef 100644
+--- a/target/i386/kvm/kvm-stub.c
++++ b/target/i386/kvm/kvm-stub.c
+@@ -44,3 +44,8 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
+ {
+ abort();
+ }
++
++void kvm_set_max_apic_id(uint32_t max_apic_id)
++{
++ return;
++}
+diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
+index ae50575edc7e..43187699456c 100644
+--- a/target/i386/kvm/kvm.c
++++ b/target/i386/kvm/kvm.c
+@@ -5526,3 +5526,8 @@ void kvm_arch_accel_class_init(ObjectClass *oc)
+ "Clock cycles without an event window "
+ "after which a notification VM exit occurs");
+ }
++
++void kvm_set_max_apic_id(uint32_t max_apic_id)
++{
++ kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id);
++}
+diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
+index 2ed586c11b7e..23f6cab60017 100644
+--- a/target/i386/kvm/kvm_i386.h
++++ b/target/i386/kvm/kvm_i386.h
+@@ -65,4 +65,6 @@ typedef struct kvm_msr_handlers {
+ bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
+ QEMUWRMSRHandler *wrmsr);
+
++void kvm_set_max_apic_id(uint32_t max_apic_id);
++
+ #endif
+--
+2.33.0
+
diff --git a/1104-target-i386-Add-SGX-aex-notify-and-EDECCSSA-support.patch b/1104-target-i386-Add-SGX-aex-notify-and-EDECCSSA-support.patch
new file mode 100644
index 0000000000000000000000000000000000000000..40de9192c8b873178cc6962a3214752dfc398e03
--- /dev/null
+++ b/1104-target-i386-Add-SGX-aex-notify-and-EDECCSSA-support.patch
@@ -0,0 +1,60 @@
+From ba1fb13629c621ca0013752502927b767abd632c Mon Sep 17 00:00:00 2001
+From: Kai Huang
+Date: Wed, 9 Nov 2022 15:48:34 +1300
+Subject: [PATCH 1104/1119] target/i386: Add SGX aex-notify and EDECCSSA
+ support
+
+commit d45f24fe7525d8a8aaa4ca6d9d214dc41819caa5 upstream.
+
+The new SGX Asynchronous Exit (AEX) notification mechanism (AEX-notify)
+allows one enclave to receive a notification in the ERESUME after the
+enclave exit due to an AEX. EDECCSSA is a new SGX user leaf function
+(ENCLU[EDECCSSA]) to facilitate the AEX notification handling.
+
+Whether the hardware supports to create enclave with AEX-notify support
+is enumerated via CPUID.(EAX=0x12,ECX=0x1):EAX[10]. The new EDECCSSA
+user leaf function is enumerated via CPUID.(EAX=0x12,ECX=0x0):EAX[11].
+
+Add support to allow to expose the new SGX AEX-notify feature and the
+new EDECCSSA user leaf function to KVM guest.
+
+Intel-SIG: commit d45f24fe7525 ("target/i386: Add SGX aex-notify and EDECCSSA support").
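+
+For illustration, the two enumeration bits described above can be probed
+on the host with a short check (a sketch, assuming a GCC-style cpuid.h;
+the helper names are illustrative only):
+
+    #include <cpuid.h>
+    #include <stdbool.h>
+
+    static bool sgx_aex_notify_supported(void)
+    {
+        unsigned a, b, c, d;
+        __cpuid_count(0x12, 1, a, b, c, d); /* SGX sub-leaf 1 */
+        return a & (1u << 10);              /* CPUID.(EAX=0x12,ECX=0x1):EAX[10] */
+    }
+
+    static bool sgx_edeccssa_supported(void)
+    {
+        unsigned a, b, c, d;
+        __cpuid_count(0x12, 0, a, b, c, d); /* SGX sub-leaf 0 */
+        return a & (1u << 11);              /* CPUID.(EAX=0x12,ECX=0x0):EAX[11] */
+    }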
+Backport SGX aex_notify and EDECCSSA support
+
+Link: https://lore.kernel.org/lkml/166760360549.4906.809756297092548496.tip-bot2@tip-bot2/
+Link: https://lore.kernel.org/lkml/166760360934.4906.2427175408052308969.tip-bot2@tip-bot2/
+Reviewed-by: Yang Zhong
+Signed-off-by: Kai Huang
+Message-Id: <20221109024834.172705-1-kai.huang@intel.com>
+Signed-off-by: Paolo Bonzini
+[ jason: amend commit log ]
+Signed-off-by: Jason Zeng
+---
+ target/i386/cpu.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/target/i386/cpu.c b/target/i386/cpu.c
+index 67b359328569..21300c2d1916 100644
+--- a/target/i386/cpu.c
++++ b/target/i386/cpu.c
+@@ -1246,7 +1246,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
+ .feat_names = {
+ "sgx1", "sgx2", NULL, NULL,
+ NULL, NULL, NULL, NULL,
+- NULL, NULL, NULL, NULL,
++ NULL, NULL, NULL, "sgx-edeccssa",
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+@@ -1286,7 +1286,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
+ .feat_names = {
+ NULL, "sgx-debug", "sgx-mode64", NULL,
+ "sgx-provisionkey", "sgx-tokenkey", NULL, "sgx-kss",
+- NULL, NULL, NULL, NULL,
++ NULL, NULL, "sgx-aex-notify", NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+--
+2.33.0
+
diff --git a/1105-target-i386-KVM-allow-fast-string-operations-if-host.patch b/1105-target-i386-KVM-allow-fast-string-operations-if-host.patch
new file mode 100644
index 0000000000000000000000000000000000000000..3d12a0a42d5fc757bbcff3c25d7ac447a87710b8
--- /dev/null
+++ b/1105-target-i386-KVM-allow-fast-string-operations-if-host.patch
@@ -0,0 +1,74 @@
+From c8023d07ed346dc10f2228a9a2d7f2ec9436a699 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini
+Date: Mon, 27 Feb 2023 10:41:46 +0100
+Subject: [PATCH 1105/1119] target/i386: KVM: allow fast string operations if
+ host supports them
+
+commit 3023c9b4d1092eb27a523c08d9e78cbaec67b59b upstream.
+
+These are just a flag that documents the performance characteristic of
+an instruction; it needs no hypervisor support. So include them even
+if KVM does not show them. In particular, FZRM/FSRS/FSRC have only
+been added very recently, but they are available on Sapphire Rapids
+processors.
+
+Intel-SIG: commit 3023c9b4d109 target/i386: KVM: allow fast string operations if host supports them
+Backport to add new CPU model SapphireRapids and new fast string op leaves.
+
+Reviewed-by: Xiaoyao Li
+Signed-off-by: Paolo Bonzini
+[ Aichun Shi: amend commit log ]
+Signed-off-by: Aichun Shi
+---
+ target/i386/kvm/kvm.c | 17 ++++++++++++++++-
+ 1 file changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
+index 43187699456c..11f2ca08ee3e 100644
+--- a/target/i386/kvm/kvm.c
++++ b/target/i386/kvm/kvm.c
+@@ -354,7 +354,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
+ {
+ struct kvm_cpuid2 *cpuid;
+ uint32_t ret = 0;
+- uint32_t cpuid_1_edx;
++ uint32_t cpuid_1_edx, unused;
+ uint64_t bitmask;
+
+ cpuid = get_supported_cpuid(s);
+@@ -401,10 +401,20 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
+ } else if (function == 6 && reg == R_EAX) {
+ ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
+ } else if (function == 7 && index == 0 && reg == R_EBX) {
++ /* Not new instructions, just an optimization. */
++ uint32_t ebx;
++ host_cpuid(7, 0, &unused, &ebx, &unused, &unused);
++ ret |= ebx & CPUID_7_0_EBX_ERMS;
++
+ if (host_tsx_broken()) {
+ ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
+ }
+ } else if (function == 7 && index == 0 && reg == R_EDX) {
++ /* Not new instructions, just an optimization. */
++ uint32_t edx;
++ host_cpuid(7, 0, &unused, &unused, &unused, &edx);
++ ret |= edx & CPUID_7_0_EDX_FSRM;
++
+ /*
+ * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
+ * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
+@@ -413,6 +423,11 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
+ if (!has_msr_arch_capabs) {
+ ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
+ }
++ } else if (function == 7 && index == 1 && reg == R_EAX) {
++ /* Not new instructions, just an optimization. */
++ uint32_t eax;
++ host_cpuid(7, 1, &eax, &unused, &unused, &unused);
++ ret |= eax & (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_FSRC);
+ } else if (function == 0xd && index == 0 &&
+ (reg == R_EAX || reg == R_EDX)) {
+ /*
+--
+2.33.0
+
diff --git a/1106-configure-meson-move-AVX-tests-to-meson.patch b/1106-configure-meson-move-AVX-tests-to-meson.patch
new file mode 100644
index 0000000000000000000000000000000000000000..6f54e315e396a46db466d1e04f06d06078f35c8c
--- /dev/null
+++ b/1106-configure-meson-move-AVX-tests-to-meson.patch
@@ -0,0 +1,283 @@
+From 532949663ff716af191ab1519f9f2673e550d967 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini
+Date: Mon, 8 Nov 2021 13:38:58 +0100
+Subject: [PATCH 1106/1119] configure, meson: move AVX tests to meson
+
+commit 622753d2fb501509ab03c241d476815f378d4ba5 upstream.
+
+For consistency with other tests, --enable-avx2 and --enable-avx512f
+fail to compile on x86 systems if cpuid.h is not available.
+
+Intel-SIG: commit 622753d2fb50 configure, meson: move AVX tests to meson
+Backport AVX512 support for xbzrle_encode_buffer.
+
+Reviewed-by: Richard Henderson
+Signed-off-by: Paolo Bonzini
+[ Aichun Shi: amend commit log ]
+Signed-off-by: Aichun Shi
+---
+ configure | 103 ----------------------------------
+ meson.build | 50 ++++++++++++++++-
+ meson_options.txt | 4 ++
+ scripts/meson-buildoptions.sh | 6 ++
+ 4 files changed, 58 insertions(+), 105 deletions(-)
+
+diff --git a/configure b/configure
+index 1f932f7eeb8c..baaa95222c48 100755
+--- a/configure
++++ b/configure
+@@ -330,8 +330,6 @@ qom_cast_debug="yes"
+ trace_backends="log"
+ trace_file="trace"
+ opengl="$default_feature"
+-cpuid_h="no"
+-avx2_opt="$default_feature"
+ guest_agent="$default_feature"
+ guest_agent_with_vss="no"
+ guest_agent_ntddscsi="no"
+@@ -1052,14 +1050,6 @@ for opt do
+ ;;
+ --disable-tools) want_tools="no"
+ ;;
+- --disable-avx2) avx2_opt="no"
+- ;;
+- --enable-avx2) avx2_opt="yes"
+- ;;
+- --disable-avx512f) avx512f_opt="no"
+- ;;
+- --enable-avx512f) avx512f_opt="yes"
+- ;;
+ --disable-virtio-blk-data-plane|--enable-virtio-blk-data-plane)
+ echo "$0: $opt is obsolete, virtio-blk data-plane is always on" >&2
+ ;;
+@@ -1455,8 +1445,6 @@ cat << EOF
+ tpm TPM support
+ libssh ssh block device support
+ numa libnuma support
+- avx2 AVX2 optimization support
+- avx512f AVX512F optimization support
+ replication replication support
+ opengl opengl support
+ xfsctl xfsctl support
+@@ -2895,85 +2883,6 @@ else # "$safe_stack" = ""
+ fi
+ fi
+
+-########################################
+-# check if cpuid.h is usable.
+-
+-cat > $TMPC << EOF
+-#include <cpuid.h>
+-int main(void) {
+- unsigned a, b, c, d;
+- int max = __get_cpuid_max(0, 0);
+-
+- if (max >= 1) {
+- __cpuid(1, a, b, c, d);
+- }
+-
+- if (max >= 7) {
+- __cpuid_count(7, 0, a, b, c, d);
+- }
+-
+- return 0;
+-}
+-EOF
+-if compile_prog "" "" ; then
+- cpuid_h=yes
+-fi
+-
+-##########################################
+-# avx2 optimization requirement check
+-#
+-# There is no point enabling this if cpuid.h is not usable,
+-# since we won't be able to select the new routines.
+-
+-if test "$cpuid_h" = "yes" && test "$avx2_opt" != "no"; then
+- cat > $TMPC << EOF
+-#pragma GCC push_options
+-#pragma GCC target("avx2")
+-#include <cpuid.h>
+-#include <immintrin.h>
+-static int bar(void *a) {
+- __m256i x = *(__m256i *)a;
+- return _mm256_testz_si256(x, x);
+-}
+-int main(int argc, char *argv[]) { return bar(argv[0]); }
+-EOF
+- if compile_object "-Werror" ; then
+- avx2_opt="yes"
+- else
+- avx2_opt="no"
+- fi
+-fi
+-
+-##########################################
+-# avx512f optimization requirement check
+-#
+-# There is no point enabling this if cpuid.h is not usable,
+-# since we won't be able to select the new routines.
+-# by default, it is turned off.
+-# if user explicitly want to enable it, check environment
+-
+-if test "$cpuid_h" = "yes" && test "$avx512f_opt" = "yes"; then
+- cat > $TMPC << EOF
+-#pragma GCC push_options
+-#pragma GCC target("avx512f")
+-#include <cpuid.h>
+-#include <immintrin.h>
+-static int bar(void *a) {
+- __m512i x = *(__m512i *)a;
+- return _mm512_test_epi64_mask(x, x);
+-}
+-int main(int argc, char *argv[])
+-{
+- return bar(argv[0]);
+-}
+-EOF
+- if ! compile_object "-Werror" ; then
+- avx512f_opt="no"
+- fi
+-else
+- avx512f_opt="no"
+-fi
+-
+ ########################################
+ # check if __[u]int128_t is usable.
+@@ -3585,14 +3494,6 @@ if test "$opengl" = "yes" ; then
+ echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
+ fi
+
+-if test "$avx2_opt" = "yes" ; then
+- echo "CONFIG_AVX2_OPT=y" >> $config_host_mak
+-fi
+-
+-if test "$avx512f_opt" = "yes" ; then
+- echo "CONFIG_AVX512F_OPT=y" >> $config_host_mak
+-fi
+-
+ # XXX: suppress that
+ if [ "$bsd" = "yes" ] ; then
+ echo "CONFIG_BSD=y" >> $config_host_mak
+@@ -3625,10 +3526,6 @@ if test "$have_tsan" = "yes" && test "$have_tsan_iface_fiber" = "yes" ; then
+ echo "CONFIG_TSAN=y" >> $config_host_mak
+ fi
+
+-if test "$cpuid_h" = "yes" ; then
+- echo "CONFIG_CPUID_H=y" >> $config_host_mak
+-fi
+-
+ if test "$int128" = "yes" ; then
+ echo "CONFIG_INT128=y" >> $config_host_mak
+ fi
+diff --git a/meson.build b/meson.build
+index c5fdb785696c..37922021b45a 100644
+--- a/meson.build
++++ b/meson.build
+@@ -1742,6 +1742,52 @@ config_host_data.set('CONFIG_GETAUXVAL', cc.links(gnu_source_prefix + '''
+ return getauxval(AT_HWCAP) == 0;
+ }'''))
+
++have_cpuid_h = cc.links('''
++ #include <cpuid.h>
++ int main(void) {
++ unsigned a, b, c, d;
++ unsigned max = __get_cpuid_max(0, 0);
++
++ if (max >= 1) {
++ __cpuid(1, a, b, c, d);
++ }
++
++ if (max >= 7) {
++ __cpuid_count(7, 0, a, b, c, d);
++ }
++
++ return 0;
++ }''')
++config_host_data.set('CONFIG_CPUID_H', have_cpuid_h)
++
++config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \
++ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX2') \
++ .require(cc.links('''
++ #pragma GCC push_options
++ #pragma GCC target("avx2")
++ #include <cpuid.h>
++ #include <immintrin.h>
++ static int bar(void *a) {
++ __m256i x = *(__m256i *)a;
++ return _mm256_testz_si256(x, x);
++ }
++ int main(int argc, char *argv[]) { return bar(argv[0]); }
++ '''), error_message: 'AVX2 not available').allowed())
++
++config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
++ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512F') \
++ .require(cc.links('''
++ #pragma GCC push_options
++ #pragma GCC target("avx512f")
++ #include <cpuid.h>
++ #include <immintrin.h>
++ static int bar(void *a) {
++ __m512i x = *(__m512i *)a;
++ return _mm512_test_epi64_mask(x, x);
++ }
++ int main(int argc, char *argv[]) { return bar(argv[0]); }
++ '''), error_message: 'AVX512F not available').allowed())
++
+ config_host_data.set('CONFIG_AF_VSOCK', cc.compiles(gnu_source_prefix + '''
+ #include
+ #include
+@@ -3263,8 +3309,8 @@ summary_info += {'membarrier': config_host.has_key('CONFIG_MEMBARRIER')}
+ summary_info += {'debug stack usage': config_host.has_key('CONFIG_DEBUG_STACK_USAGE')}
+ summary_info += {'mutex debugging': config_host.has_key('CONFIG_DEBUG_MUTEX')}
+ summary_info += {'memory allocator': get_option('malloc')}
+-summary_info += {'avx2 optimization': config_host.has_key('CONFIG_AVX2_OPT')}
+-summary_info += {'avx512f optimization': config_host.has_key('CONFIG_AVX512F_OPT')}
++summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')}
++summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')}
+ summary_info += {'gprof enabled': config_host.has_key('CONFIG_GPROF')}
+ summary_info += {'gcov': get_option('b_coverage')}
+ summary_info += {'thread sanitizer': config_host.has_key('CONFIG_TSAN')}
+diff --git a/meson_options.txt b/meson_options.txt
+index e3923237322a..e9cbe48cb90b 100644
+--- a/meson_options.txt
++++ b/meson_options.txt
+@@ -66,6 +66,10 @@ option('cfi_debug', type: 'boolean', value: 'false',
+ description: 'Verbose errors in case of CFI violation')
+ option('multiprocess', type: 'feature', value: 'auto',
+ description: 'Out of process device emulation support')
++option('avx2', type: 'feature', value: 'auto',
++ description: 'AVX2 optimizations')
++option('avx512f', type: 'feature', value: 'disabled',
++ description: 'AVX512F optimizations')
+
+ option('attr', type : 'feature', value : 'auto',
+ description: 'attr/xattr support')
+diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
+index 7a17ff42182f..b994bf16f0a6 100644
+--- a/scripts/meson-buildoptions.sh
++++ b/scripts/meson-buildoptions.sh
+@@ -25,6 +25,8 @@ meson_options_help() {
+ printf "%s\n" ' alsa ALSA sound support'
+ printf "%s\n" ' attr attr/xattr support'
+ printf "%s\n" ' auth-pam PAM access control'
++ printf "%s\n" ' avx2 AVX2 optimizations'
++ printf "%s\n" ' avx512f AVX512F optimizations'
+ printf "%s\n" ' bpf eBPF support'
+ printf "%s\n" ' brlapi brlapi character device driver'
+ printf "%s\n" ' bzip2 bzip2 support for DMG images'
+@@ -107,6 +109,10 @@ _meson_option_parse() {
+ --disable-attr) printf "%s" -Dattr=disabled ;;
+ --enable-auth-pam) printf "%s" -Dauth_pam=enabled ;;
+ --disable-auth-pam) printf "%s" -Dauth_pam=disabled ;;
++ --enable-avx2) printf "%s" -Davx2=enabled ;;
++ --disable-avx2) printf "%s" -Davx2=disabled ;;
++ --enable-avx512f) printf "%s" -Davx512f=enabled ;;
++ --disable-avx512f) printf "%s" -Davx512f=disabled ;;
+ --enable-bpf) printf "%s" -Dbpf=enabled ;;
+ --disable-bpf) printf "%s" -Dbpf=disabled ;;
+ --enable-brlapi) printf "%s" -Dbrlapi=enabled ;;
+--
+2.33.0
+
diff --git a/1107-AVX512-support-for-xbzrle_encode_buffer.patch b/1107-AVX512-support-for-xbzrle_encode_buffer.patch
new file mode 100644
index 0000000000000000000000000000000000000000..9a7d731179ebe398980718ea0c6847ed81d40215
--- /dev/null
+++ b/1107-AVX512-support-for-xbzrle_encode_buffer.patch
@@ -0,0 +1,303 @@
+From 51dc6acdaa0b68f38d6d448b8187d1c79dc3f566 Mon Sep 17 00:00:00 2001
+From: ling xu
+Date: Wed, 16 Nov 2022 23:29:22 +0800
+Subject: [PATCH 1107/1119] AVX512 support for xbzrle_encode_buffer
+
+commit 04ffce137b6d85ab4e7687e54e4dffcef0a9ab99 upstream.
+
+This commit is the same as [PATCH v6 1/2], and provides avx512 support for xbzrle_encode_buffer
+function to accelerate xbzrle encoding speed. Runtime check of avx512
+support and benchmark for this feature are added. Compared with C
+version of xbzrle_encode_buffer function, avx512 version can achieve
+50%-70% performance improvement on benchmarking. In addition, if dirty
+data is randomly located in 4K page, the avx512 version can achieve
+almost 140% performance gain.
+
+Intel-SIG: commit 04ffce137b6d AVX512 support for xbzrle_encode_buffer
+Backport AVX512 support for xbzrle_encode_buffer.
+
+Signed-off-by: ling xu
+Co-authored-by: Zhou Zhao
+Co-authored-by: Jun Jin
+Reviewed-by: Juan Quintela
+Signed-off-by: Juan Quintela
+[ Aichun Shi: amend commit log ]
+Signed-off-by: Aichun Shi
+---
+ meson.build | 17 +++++
+ meson_options.txt | 2 +
+ migration/ram.c | 34 +++++++++-
+ migration/xbzrle.c | 124 ++++++++++++++++++++++++++++++++++
+ migration/xbzrle.h | 4 ++
+ scripts/meson-buildoptions.sh | 3 +
+ 6 files changed, 181 insertions(+), 3 deletions(-)
+
+diff --git a/meson.build b/meson.build
+index 37922021b45a..a995f62d51bd 100644
+--- a/meson.build
++++ b/meson.build
+@@ -1808,6 +1808,22 @@ config_host_data.set('CONFIG_AF_VSOCK', cc.compiles(gnu_source_prefix + '''
+ return -1;
+ }'''))
+
++config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
++ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
++ .require(cc.links('''
++ #pragma GCC push_options
++ #pragma GCC target("avx512bw")
++ #include <cpuid.h>
++ #include <immintrin.h>
++ static int bar(void *a) {
++
++ __m512i *x = a;
++ __m512i res= _mm512_abs_epi8(*x);
++ return res[1];
++ }
++ int main(int argc, char *argv[]) { return bar(argv[0]); }
++ '''), error_message: 'AVX512BW not available').allowed())
++
+ ignored = ['CONFIG_QEMU_INTERP_PREFIX', # actually per-target
+ 'HAVE_GDB_BIN']
+ arrays = ['CONFIG_BDRV_RW_WHITELIST', 'CONFIG_BDRV_RO_WHITELIST']
+@@ -3310,6 +3326,7 @@ summary_info += {'debug stack usage': config_host.has_key('CONFIG_DEBUG_STACK_US
+ summary_info += {'mutex debugging': config_host.has_key('CONFIG_DEBUG_MUTEX')}
+ summary_info += {'memory allocator': get_option('malloc')}
+ summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')}
++summary_info += {'avx512bw optimization': config_host_data.get('CONFIG_AVX512BW_OPT')}
+ summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')}
+ summary_info += {'gprof enabled': config_host.has_key('CONFIG_GPROF')}
+ summary_info += {'gcov': get_option('b_coverage')}
+diff --git a/meson_options.txt b/meson_options.txt
+index e9cbe48cb90b..ec9c3c0a05e2 100644
+--- a/meson_options.txt
++++ b/meson_options.txt
+@@ -70,6 +70,8 @@ option('avx2', type: 'feature', value: 'auto',
+ description: 'AVX2 optimizations')
+ option('avx512f', type: 'feature', value: 'disabled',
+ description: 'AVX512F optimizations')
++option('avx512bw', type: 'feature', value: 'auto',
++ description: 'AVX512BW optimizations')
+
+ option('attr', type : 'feature', value : 'auto',
+ description: 'attr/xattr support')
+diff --git a/migration/ram.c b/migration/ram.c
+index 727fe801dbe9..efb6b1560489 100644
+--- a/migration/ram.c
++++ b/migration/ram.c
+@@ -102,6 +102,34 @@ static inline bool is_zero_range(uint8_t *p, uint64_t size)
+ return buffer_is_zero(p, size);
+ }
+
++int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
++ uint8_t *, int) = xbzrle_encode_buffer;
++#if defined(CONFIG_AVX512BW_OPT)
++#include "qemu/cpuid.h"
++static void __attribute__((constructor)) init_cpu_flag(void)
++{
++ unsigned max = __get_cpuid_max(0, NULL);
++ int a, b, c, d;
++ if (max >= 1) {
++ __cpuid(1, a, b, c, d);
++ /* We must check that AVX is not just available, but usable. */
++ if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
++ int bv;
++ __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
++ __cpuid_count(7, 0, a, b, c, d);
++ /* 0xe6:
++ * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
++ * and ZMM16-ZMM31 state are enabled by OS)
++ * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
++ */
++ if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
++ xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
++ }
++ }
++ }
++}
++#endif
++
+ XBZRLECacheStats xbzrle_counters;
+
+ /* struct contains XBZRLE cache and a static page
+@@ -766,9 +794,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
+ memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
+
+ /* XBZRLE encoding (if there is no overflow) */
+- encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
+- TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
+- TARGET_PAGE_SIZE);
++ encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
++ TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
++ TARGET_PAGE_SIZE);
+
+ /*
+ * Update the cache contents, so that it corresponds to the data
+diff --git a/migration/xbzrle.c b/migration/xbzrle.c
+index 1ba482ded9c4..05366e86c05a 100644
+--- a/migration/xbzrle.c
++++ b/migration/xbzrle.c
+@@ -174,3 +174,127 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
+
+ return d;
+ }
++
++#if defined(CONFIG_AVX512BW_OPT)
++#pragma GCC push_options
++#pragma GCC target("avx512bw")
++#include <immintrin.h>
++int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
++ uint8_t *dst, int dlen)
++{
++ uint32_t zrun_len = 0, nzrun_len = 0;
++ int d = 0, i = 0, num = 0;
++ uint8_t *nzrun_start = NULL;
++ /* add 1 to include residual part in main loop */
++ uint32_t count512s = (slen >> 6) + 1;
++ /* countResidual is tail of data, i.e., countResidual = slen % 64 */
++ uint32_t count_residual = slen & 0b111111;
++ bool never_same = true;
++ uint64_t mask_residual = 1;
++ mask_residual <<= count_residual;
++ mask_residual -= 1;
++ __m512i r = _mm512_set1_epi32(0);
++
++ while (count512s) {
++ if (d + 2 > dlen) {
++ return -1;
++ }
++
++ int bytes_to_check = 64;
++ uint64_t mask = 0xffffffffffffffff;
++ if (count512s == 1) {
++ bytes_to_check = count_residual;
++ mask = mask_residual;
++ }
++ __m512i old_data = _mm512_mask_loadu_epi8(r,
++ mask, old_buf + i);
++ __m512i new_data = _mm512_mask_loadu_epi8(r,
++ mask, new_buf + i);
++ uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
++ count512s--;
++
++ bool is_same = (comp & 0x1);
++ while (bytes_to_check) {
++ if (is_same) {
++ if (nzrun_len) {
++ d += uleb128_encode_small(dst + d, nzrun_len);
++ if (d + nzrun_len > dlen) {
++ return -1;
++ }
++ nzrun_start = new_buf + i - nzrun_len;
++ memcpy(dst + d, nzrun_start, nzrun_len);
++ d += nzrun_len;
++ nzrun_len = 0;
++ }
++ /* 64 data at a time for speed */
++ if (count512s && (comp == 0xffffffffffffffff)) {
++ i += 64;
++ zrun_len += 64;
++ break;
++ }
++ never_same = false;
++ num = __builtin_ctzll(~comp);
++ num = (num < bytes_to_check) ? num : bytes_to_check;
++ zrun_len += num;
++ bytes_to_check -= num;
++ comp >>= num;
++ i += num;
++ if (bytes_to_check) {
++ /* still has different data after same data */
++ d += uleb128_encode_small(dst + d, zrun_len);
++ zrun_len = 0;
++ } else {
++ break;
++ }
++ }
++ if (never_same || zrun_len) {
++ /*
++ * never_same only acts if
++ * data begins with diff in first count512s
++ */
++ d += uleb128_encode_small(dst + d, zrun_len);
++ zrun_len = 0;
++ never_same = false;
++ }
++ /* has diff, 64 data at a time for speed */
++ if ((bytes_to_check == 64) && (comp == 0x0)) {
++ i += 64;
++ nzrun_len += 64;
++ break;
++ }
++ num = __builtin_ctzll(comp);
++ num = (num < bytes_to_check) ? num : bytes_to_check;
++ nzrun_len += num;
++ bytes_to_check -= num;
++ comp >>= num;
++ i += num;
++ if (bytes_to_check) {
++ /* mask like 111000 */
++ d += uleb128_encode_small(dst + d, nzrun_len);
++ /* overflow */
++ if (d + nzrun_len > dlen) {
++ return -1;
++ }
++ nzrun_start = new_buf + i - nzrun_len;
++ memcpy(dst + d, nzrun_start, nzrun_len);
++ d += nzrun_len;
++ nzrun_len = 0;
++ is_same = true;
++ }
++ }
++ }
++
++ if (nzrun_len != 0) {
++ d += uleb128_encode_small(dst + d, nzrun_len);
++ /* overflow */
++ if (d + nzrun_len > dlen) {
++ return -1;
++ }
++ nzrun_start = new_buf + i - nzrun_len;
++ memcpy(dst + d, nzrun_start, nzrun_len);
++ d += nzrun_len;
++ }
++ return d;
++}
++#pragma GCC pop_options
++#endif
+diff --git a/migration/xbzrle.h b/migration/xbzrle.h
+index a0db507b9cd9..6feb49160adf 100644
+--- a/migration/xbzrle.h
++++ b/migration/xbzrle.h
+@@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
+ uint8_t *dst, int dlen);
+
+ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
++#if defined(CONFIG_AVX512BW_OPT)
++int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
++ uint8_t *dst, int dlen);
++#endif
+ #endif
+diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
+index b994bf16f0a6..8c00cce41183 100644
+--- a/scripts/meson-buildoptions.sh
++++ b/scripts/meson-buildoptions.sh
+@@ -26,6 +26,7 @@ meson_options_help() {
+ printf "%s\n" ' attr attr/xattr support'
+ printf "%s\n" ' auth-pam PAM access control'
+ printf "%s\n" ' avx2 AVX2 optimizations'
++ printf "%s\n" ' avx512bw AVX512BW optimizations'
+ printf "%s\n" ' avx512f AVX512F optimizations'
+ printf "%s\n" ' bpf eBPF support'
+ printf "%s\n" ' brlapi brlapi character device driver'
+@@ -111,6 +112,8 @@ _meson_option_parse() {
+ --disable-auth-pam) printf "%s" -Dauth_pam=disabled ;;
+ --enable-avx2) printf "%s" -Davx2=enabled ;;
+ --disable-avx2) printf "%s" -Davx2=disabled ;;
++ --enable-avx512bw) printf "%s" -Davx512bw=enabled ;;
++ --disable-avx512bw) printf "%s" -Davx512bw=disabled ;;
+ --enable-avx512f) printf "%s" -Davx512f=enabled ;;
+ --disable-avx512f) printf "%s" -Davx512f=disabled ;;
+ --enable-bpf) printf "%s" -Dbpf=enabled ;;
+--
+2.33.0
+
diff --git a/1108-Update-bench-code-for-addressing-CI-problem.patch b/1108-Update-bench-code-for-addressing-CI-problem.patch
new file mode 100644
index 0000000000000000000000000000000000000000..bbbfe25495898d890c400b019a9ef86d31bdac19
--- /dev/null
+++ b/1108-Update-bench-code-for-addressing-CI-problem.patch
@@ -0,0 +1,608 @@
+From cf8eaf77dbc576dbfe72c7512c1c2c63c9523bb1 Mon Sep 17 00:00:00 2001
+From: ling xu
+Date: Wed, 16 Nov 2022 23:29:23 +0800
+Subject: [PATCH 1108/1119] Update bench-code for addressing CI problem
+
+commit cc98c9fd5c17b8ab62ad91b183060d8f70b9d00d upstream.
+
+Unit test code is in test-xbzrle.c, and benchmark code is in xbzrle-bench.c
+for performance benchmarking. We have modified xbzrle-bench.c to address
+the CI problem.
+
+Intel-SIG: commit cc98c9fd5c17 Update bench-code for addressing CI problem
+Backport AVX512 support for xbzrle_encode_buffer.
+
+Signed-off-by: ling xu
+Co-authored-by: Zhou Zhao
+Co-authored-by: Jun Jin
+Reviewed-by: Juan Quintela
+Signed-off-by: Juan Quintela
+[ Aichun Shi: amend commit log ]
+Signed-off-by: Aichun Shi
+---
+ tests/bench/meson.build | 6 +
+ tests/bench/xbzrle-bench.c | 469 +++++++++++++++++++++++++++++++++++++
+ tests/unit/test-xbzrle.c | 39 ++-
+ 3 files changed, 509 insertions(+), 5 deletions(-)
+ create mode 100644 tests/bench/xbzrle-bench.c
+
+diff --git a/tests/bench/meson.build b/tests/bench/meson.build
+index 00b3c209dcbd..54bc8938a8ad 100644
+--- a/tests/bench/meson.build
++++ b/tests/bench/meson.build
+@@ -3,6 +3,12 @@ qht_bench = executable('qht-bench',
+ sources: 'qht-bench.c',
+ dependencies: [qemuutil])
+
++if have_system
++xbzrle_bench = executable('xbzrle-bench',
++ sources: 'xbzrle-bench.c',
++ dependencies: [qemuutil,migration])
++endif
++
+ executable('atomic_add-bench',
+ sources: files('atomic_add-bench.c'),
+ dependencies: [qemuutil],
+diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c
+new file mode 100644
+index 000000000000..8848a3a32d7e
+--- /dev/null
++++ b/tests/bench/xbzrle-bench.c
+@@ -0,0 +1,469 @@
++/*
++ * Xor Based Zero Run Length Encoding unit tests.
++ *
++ * Copyright 2013 Red Hat, Inc. and/or its affiliates
++ *
++ * Authors:
++ * Orit Wasserman
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * See the COPYING file in the top-level directory.
++ *
++ */
++#include "qemu/osdep.h"
++#include "qemu/cutils.h"
++#include "../migration/xbzrle.h"
++
++#if defined(CONFIG_AVX512BW_OPT)
++#define XBZRLE_PAGE_SIZE 4096
++static bool is_cpu_support_avx512bw;
++#include "qemu/cpuid.h"
++static void __attribute__((constructor)) init_cpu_flag(void)
++{
++ unsigned max = __get_cpuid_max(0, NULL);
++ int a, b, c, d;
++ is_cpu_support_avx512bw = false;
++ if (max >= 1) {
++ __cpuid(1, a, b, c, d);
++ /* We must check that AVX is not just available, but usable.
*/ ++ if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { ++ int bv; ++ __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); ++ __cpuid_count(7, 0, a, b, c, d); ++ /* 0xe6: ++ * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 ++ * and ZMM16-ZMM31 state are enabled by OS) ++ * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) ++ */ ++ if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { ++ is_cpu_support_avx512bw = true; ++ } ++ } ++ } ++ return ; ++} ++ ++struct ResTime { ++ float t_raw; ++ float t_512; ++}; ++ ++ ++/* Function prototypes ++int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, ++ uint8_t *dst, int dlen); ++*/ ++static void encode_decode_zero(struct ResTime *res) ++{ ++ uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ int i = 0; ++ int dlen = 0, dlen512 = 0; ++ int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); ++ ++ for (i = diff_len; i > 0; i--) { ++ buffer[1000 + i] = i; ++ buffer512[1000 + i] = i; ++ } ++ ++ buffer[1000 + diff_len + 3] = 103; ++ buffer[1000 + diff_len + 5] = 105; ++ ++ buffer512[1000 + diff_len + 3] = 103; ++ buffer512[1000 + diff_len + 5] = 105; ++ ++ /* encode zero page */ ++ time_t t_start, t_end, t_start512, t_end512; ++ t_start = clock(); ++ dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, ++ XBZRLE_PAGE_SIZE); ++ t_end = clock(); ++ float time_val = difftime(t_end, t_start); ++ g_assert(dlen == 0); ++ ++ t_start512 = clock(); ++ dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE, ++ compressed512, XBZRLE_PAGE_SIZE); ++ t_end512 = clock(); ++ float time_val512 = difftime(t_end512, t_start512); ++ g_assert(dlen512 == 0); ++ ++ res->t_raw = time_val; ++ res->t_512 = time_val512; ++ ++ g_free(buffer); ++ g_free(compressed); ++ g_free(buffer512); ++ g_free(compressed512); ++ ++} ++ ++static void test_encode_decode_zero_avx512(void) ++{ ++ int i; ++ float time_raw = 0.0, time_512 = 0.0; ++ struct ResTime res; ++ for (i = 0; i < 10000; i++) { ++ encode_decode_zero(&res); ++ time_raw += res.t_raw; ++ time_512 += res.t_512; ++ } ++ printf("Zero test:\n"); ++ printf("Raw xbzrle_encode time is %f ms\n", time_raw); ++ printf("512 xbzrle_encode time is %f ms\n", time_512); ++} ++ ++static void encode_decode_unchanged(struct ResTime *res) ++{ ++ uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ int i = 0; ++ int dlen = 0, dlen512 = 0; ++ int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); ++ ++ for (i = diff_len; i > 0; i--) { ++ test[1000 + i] = i + 4; ++ test512[1000 + i] = i + 4; ++ } ++ ++ test[1000 + diff_len + 3] = 107; ++ test[1000 + diff_len + 5] = 109; ++ ++ test512[1000 + diff_len + 3] = 107; ++ test512[1000 + diff_len + 5] = 109; ++ ++ /* test unchanged buffer */ ++ time_t t_start, t_end, t_start512, t_end512; ++ t_start = clock(); ++ dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed, ++ XBZRLE_PAGE_SIZE); ++ t_end = clock(); ++ float time_val = difftime(t_end, t_start); ++ g_assert(dlen == 0); ++ ++ t_start512 = clock(); ++ dlen512 = xbzrle_encode_buffer_avx512(test512, test512, XBZRLE_PAGE_SIZE, ++ compressed512, XBZRLE_PAGE_SIZE); ++ t_end512 = clock(); ++ float time_val512 = difftime(t_end512, 
t_start512); ++ g_assert(dlen512 == 0); ++ ++ res->t_raw = time_val; ++ res->t_512 = time_val512; ++ ++ g_free(test); ++ g_free(compressed); ++ g_free(test512); ++ g_free(compressed512); ++ ++} ++ ++static void test_encode_decode_unchanged_avx512(void) ++{ ++ int i; ++ float time_raw = 0.0, time_512 = 0.0; ++ struct ResTime res; ++ for (i = 0; i < 10000; i++) { ++ encode_decode_unchanged(&res); ++ time_raw += res.t_raw; ++ time_512 += res.t_512; ++ } ++ printf("Unchanged test:\n"); ++ printf("Raw xbzrle_encode time is %f ms\n", time_raw); ++ printf("512 xbzrle_encode time is %f ms\n", time_512); ++} ++ ++static void encode_decode_1_byte(struct ResTime *res) ++{ ++ uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); ++ uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE); ++ int dlen = 0, rc = 0, dlen512 = 0, rc512 = 0; ++ uint8_t buf[2]; ++ uint8_t buf512[2]; ++ ++ test[XBZRLE_PAGE_SIZE - 1] = 1; ++ test512[XBZRLE_PAGE_SIZE - 1] = 1; ++ ++ time_t t_start, t_end, t_start512, t_end512; ++ t_start = clock(); ++ dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed, ++ XBZRLE_PAGE_SIZE); ++ t_end = clock(); ++ float time_val = difftime(t_end, t_start); ++ g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2)); ++ ++ rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE); ++ g_assert(rc == XBZRLE_PAGE_SIZE); ++ g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0); ++ ++ t_start512 = clock(); ++ dlen512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE, ++ compressed512, XBZRLE_PAGE_SIZE); ++ t_end512 = clock(); ++ float time_val512 = difftime(t_end512, t_start512); ++ g_assert(dlen512 == (uleb128_encode_small(&buf512[0], 4095) + 2)); ++ ++ rc512 = xbzrle_decode_buffer(compressed512, dlen512, buffer512, ++ XBZRLE_PAGE_SIZE); ++ g_assert(rc512 == XBZRLE_PAGE_SIZE); ++ g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0); ++ ++ res->t_raw = time_val; ++ res->t_512 = time_val512; ++ ++ g_free(buffer); ++ g_free(compressed); ++ g_free(test); ++ g_free(buffer512); ++ g_free(compressed512); ++ g_free(test512); ++ ++} ++ ++static void test_encode_decode_1_byte_avx512(void) ++{ ++ int i; ++ float time_raw = 0.0, time_512 = 0.0; ++ struct ResTime res; ++ for (i = 0; i < 10000; i++) { ++ encode_decode_1_byte(&res); ++ time_raw += res.t_raw; ++ time_512 += res.t_512; ++ } ++ printf("1 byte test:\n"); ++ printf("Raw xbzrle_encode time is %f ms\n", time_raw); ++ printf("512 xbzrle_encode time is %f ms\n", time_512); ++} ++ ++static void encode_decode_overflow(struct ResTime *res) ++{ ++ uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ int i = 0, rc = 0, rc512 = 0; ++ ++ for (i = 0; i < XBZRLE_PAGE_SIZE / 2 - 1; i++) { ++ test[i * 2] = 1; ++ test512[i * 2] = 1; ++ } ++ ++ /* encode overflow */ ++ time_t t_start, t_end, t_start512, t_end512; ++ t_start = clock(); ++ rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed, ++ XBZRLE_PAGE_SIZE); ++ t_end = clock(); ++ float time_val = difftime(t_end, t_start); ++ g_assert(rc == -1); ++ ++ t_start512 = clock(); ++ rc512 = 
xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE, ++ compressed512, XBZRLE_PAGE_SIZE); ++ t_end512 = clock(); ++ float time_val512 = difftime(t_end512, t_start512); ++ g_assert(rc512 == -1); ++ ++ res->t_raw = time_val; ++ res->t_512 = time_val512; ++ ++ g_free(buffer); ++ g_free(compressed); ++ g_free(test); ++ g_free(buffer512); ++ g_free(compressed512); ++ g_free(test512); ++ ++} ++ ++static void test_encode_decode_overflow_avx512(void) ++{ ++ int i; ++ float time_raw = 0.0, time_512 = 0.0; ++ struct ResTime res; ++ for (i = 0; i < 10000; i++) { ++ encode_decode_overflow(&res); ++ time_raw += res.t_raw; ++ time_512 += res.t_512; ++ } ++ printf("Overflow test:\n"); ++ printf("Raw xbzrle_encode time is %f ms\n", time_raw); ++ printf("512 xbzrle_encode time is %f ms\n", time_512); ++} ++ ++static void encode_decode_range_avx512(struct ResTime *res) ++{ ++ uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); ++ uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE); ++ uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ int i = 0, rc = 0, rc512 = 0; ++ int dlen = 0, dlen512 = 0; ++ ++ int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); ++ ++ for (i = diff_len; i > 0; i--) { ++ buffer[1000 + i] = i; ++ test[1000 + i] = i + 4; ++ buffer512[1000 + i] = i; ++ test512[1000 + i] = i + 4; ++ } ++ ++ buffer[1000 + diff_len + 3] = 103; ++ test[1000 + diff_len + 3] = 107; ++ ++ buffer[1000 + diff_len + 5] = 105; ++ test[1000 + diff_len + 5] = 109; ++ ++ buffer512[1000 + diff_len + 3] = 103; ++ test512[1000 + diff_len + 3] = 107; ++ ++ buffer512[1000 + diff_len + 5] = 105; ++ test512[1000 + diff_len + 5] = 109; ++ ++ /* test encode/decode */ ++ time_t t_start, t_end, t_start512, t_end512; ++ t_start = clock(); ++ dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed, ++ XBZRLE_PAGE_SIZE); ++ t_end = clock(); ++ float time_val = difftime(t_end, t_start); ++ rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE); ++ g_assert(rc < XBZRLE_PAGE_SIZE); ++ g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0); ++ ++ t_start512 = clock(); ++ dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE, ++ compressed512, XBZRLE_PAGE_SIZE); ++ t_end512 = clock(); ++ float time_val512 = difftime(t_end512, t_start512); ++ rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE); ++ g_assert(rc512 < XBZRLE_PAGE_SIZE); ++ g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0); ++ ++ res->t_raw = time_val; ++ res->t_512 = time_val512; ++ ++ g_free(buffer); ++ g_free(compressed); ++ g_free(test); ++ g_free(buffer512); ++ g_free(compressed512); ++ g_free(test512); ++ ++} ++ ++static void test_encode_decode_avx512(void) ++{ ++ int i; ++ float time_raw = 0.0, time_512 = 0.0; ++ struct ResTime res; ++ for (i = 0; i < 10000; i++) { ++ encode_decode_range_avx512(&res); ++ time_raw += res.t_raw; ++ time_512 += res.t_512; ++ } ++ printf("Encode decode test:\n"); ++ printf("Raw xbzrle_encode time is %f ms\n", time_raw); ++ printf("512 xbzrle_encode time is %f ms\n", time_512); ++} ++ ++static void encode_decode_random(struct ResTime *res) ++{ ++ uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); ++ uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ uint8_t *compressed512 = 
g_malloc(XBZRLE_PAGE_SIZE); ++ uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); ++ int i = 0, rc = 0, rc512 = 0; ++ int dlen = 0, dlen512 = 0; ++ ++ int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1); ++ /* store the index of diff */ ++ int dirty_index[diff_len]; ++ for (int j = 0; j < diff_len; j++) { ++ dirty_index[j] = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1); ++ } ++ for (i = diff_len - 1; i >= 0; i--) { ++ buffer[dirty_index[i]] = i; ++ test[dirty_index[i]] = i + 4; ++ buffer512[dirty_index[i]] = i; ++ test512[dirty_index[i]] = i + 4; ++ } ++ ++ time_t t_start, t_end, t_start512, t_end512; ++ t_start = clock(); ++ dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed, ++ XBZRLE_PAGE_SIZE); ++ t_end = clock(); ++ float time_val = difftime(t_end, t_start); ++ rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE); ++ g_assert(rc < XBZRLE_PAGE_SIZE); ++ ++ t_start512 = clock(); ++ dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE, ++ compressed512, XBZRLE_PAGE_SIZE); ++ t_end512 = clock(); ++ float time_val512 = difftime(t_end512, t_start512); ++ rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE); ++ g_assert(rc512 < XBZRLE_PAGE_SIZE); ++ ++ res->t_raw = time_val; ++ res->t_512 = time_val512; ++ ++ g_free(buffer); ++ g_free(compressed); ++ g_free(test); ++ g_free(buffer512); ++ g_free(compressed512); ++ g_free(test512); ++ ++} ++ ++static void test_encode_decode_random_avx512(void) ++{ ++ int i; ++ float time_raw = 0.0, time_512 = 0.0; ++ struct ResTime res; ++ for (i = 0; i < 10000; i++) { ++ encode_decode_random(&res); ++ time_raw += res.t_raw; ++ time_512 += res.t_512; ++ } ++ printf("Random test:\n"); ++ printf("Raw xbzrle_encode time is %f ms\n", time_raw); ++ printf("512 xbzrle_encode time is %f ms\n", time_512); ++} ++#endif ++ ++int main(int argc, char **argv) ++{ ++ g_test_init(&argc, &argv, NULL); ++ g_test_rand_int(); ++ #if defined(CONFIG_AVX512BW_OPT) ++ if (likely(is_cpu_support_avx512bw)) { ++ g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero_avx512); ++ g_test_add_func("/xbzrle/encode_decode_unchanged", ++ test_encode_decode_unchanged_avx512); ++ g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte_avx512); ++ g_test_add_func("/xbzrle/encode_decode_overflow", ++ test_encode_decode_overflow_avx512); ++ g_test_add_func("/xbzrle/encode_decode", test_encode_decode_avx512); ++ g_test_add_func("/xbzrle/encode_decode_random", test_encode_decode_random_avx512); ++ } ++ #endif ++ return g_test_run(); ++} +diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c +index 795d6f1cbabb..baa364b443b6 100644 +--- a/tests/unit/test-xbzrle.c ++++ b/tests/unit/test-xbzrle.c +@@ -17,6 +17,35 @@ + + #define XBZRLE_PAGE_SIZE 4096 + ++int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, ++ uint8_t *, int) = xbzrle_encode_buffer; ++#if defined(CONFIG_AVX512BW_OPT) ++#include "qemu/cpuid.h" ++static void __attribute__((constructor)) init_cpu_flag(void) ++{ ++ unsigned max = __get_cpuid_max(0, NULL); ++ int a, b, c, d; ++ if (max >= 1) { ++ __cpuid(1, a, b, c, d); ++ /* We must check that AVX is not just available, but usable. 
*/ ++ if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { ++ int bv; ++ __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); ++ __cpuid_count(7, 0, a, b, c, d); ++ /* 0xe6: ++ * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 ++ * and ZMM16-ZMM31 state are enabled by OS) ++ * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) ++ */ ++ if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { ++ xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; ++ } ++ } ++ } ++ return ; ++} ++#endif ++ + static void test_uleb(void) + { + uint32_t i, val; +@@ -55,7 +84,7 @@ static void test_encode_decode_zero(void) + buffer[1000 + diff_len + 5] = 105; + + /* encode zero page */ +- dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, ++ dlen = xbzrle_encode_buffer_func(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + g_assert(dlen == 0); + +@@ -79,7 +108,7 @@ static void test_encode_decode_unchanged(void) + test[1000 + diff_len + 5] = 109; + + /* test unchanged buffer */ +- dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed, ++ dlen = xbzrle_encode_buffer_func(test, test, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + g_assert(dlen == 0); + +@@ -97,7 +126,7 @@ static void test_encode_decode_1_byte(void) + + test[XBZRLE_PAGE_SIZE - 1] = 1; + +- dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed, ++ dlen = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2)); + +@@ -122,7 +151,7 @@ static void test_encode_decode_overflow(void) + } + + /* encode overflow */ +- rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed, ++ rc = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + g_assert(rc == -1); + +@@ -153,7 +182,7 @@ static void encode_decode_range(void) + test[1000 + diff_len + 5] = 109; + + /* test encode/decode */ +- dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed, ++ dlen = xbzrle_encode_buffer_func(test, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); + + rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE); +-- +2.33.0 + diff --git a/1109-migration-xbzrle-use-ctz64-to-avoid-undefined-result.patch b/1109-migration-xbzrle-use-ctz64-to-avoid-undefined-result.patch new file mode 100644 index 0000000000000000000000000000000000000000..225548bdbf4b2c92d196aadfa3c61e7a4a9ce9c2 --- /dev/null +++ b/1109-migration-xbzrle-use-ctz64-to-avoid-undefined-result.patch @@ -0,0 +1,63 @@ +From 4e5984b99f74ce99855d3b6bab561004fad0a9bd Mon Sep 17 00:00:00 2001 +From: Matheus Tavares Bernardino +Date: Mon, 13 Mar 2023 15:58:19 -0300 +Subject: [PATCH 1109/1119] migration/xbzrle: use ctz64 to avoid undefined + result + +commit d84a78d15d3af9ff28ceec6906a4b101bd545b55 upstream. + +__builtin_ctzll() produces undefined results when the argument is 0. +This can be seen through test-xbzrle, which produces the following +warning: + +../migration/xbzrle.c:265: runtime error: passing zero to ctz(), which is not a valid argument + +Replace __builtin_ctzll() with our ctz64() wrapper which properly +handles 0. + +Intel-SIG: commit d84a78d15d3a migration/xbzrle: use ctz64 to avoid undefined result +Backport AVX512 support for xbzrle_encode_buffer. + +Signed-off-by: Matheus Tavares Bernardino +Reviewed-by: Dr. 
David Alan Gilbert +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +[ Aichun Shi: amend commit log ] +Signed-off-by: Aichun Shi +--- + migration/xbzrle.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/migration/xbzrle.c b/migration/xbzrle.c +index 05366e86c05a..21b92d4eae1c 100644 +--- a/migration/xbzrle.c ++++ b/migration/xbzrle.c +@@ -12,6 +12,7 @@ + */ + #include "qemu/osdep.h" + #include "qemu/cutils.h" ++#include "qemu/host-utils.h" + #include "xbzrle.h" + + /* +@@ -233,7 +234,7 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, + break; + } + never_same = false; +- num = __builtin_ctzll(~comp); ++ num = ctz64(~comp); + num = (num < bytes_to_check) ? num : bytes_to_check; + zrun_len += num; + bytes_to_check -= num; +@@ -262,7 +263,7 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, + nzrun_len += 64; + break; + } +- num = __builtin_ctzll(comp); ++ num = ctz64(comp); + num = (num < bytes_to_check) ? num : bytes_to_check; + nzrun_len += num; + bytes_to_check -= num; +-- +2.33.0 + diff --git a/1110-migration-xbzrle-fix-out-of-bounds-write-with-axv512.patch b/1110-migration-xbzrle-fix-out-of-bounds-write-with-axv512.patch new file mode 100644 index 0000000000000000000000000000000000000000..57f16f67f795bcd039c6a696639b4ec53aa32b8a --- /dev/null +++ b/1110-migration-xbzrle-fix-out-of-bounds-write-with-axv512.patch @@ -0,0 +1,75 @@ +From cadd3c92228445f6c313ddbb0eb63192ba5f650b Mon Sep 17 00:00:00 2001 +From: Matheus Tavares Bernardino +Date: Mon, 13 Mar 2023 15:58:20 -0300 +Subject: [PATCH 1110/1119] migration/xbzrle: fix out-of-bounds write with + axv512 + +commit 1776b70f55c75541e9cab3423650a59b085162a9 upstream. + +xbzrle_encode_buffer_avx512() checks for overflows too scarcely in its +outer loop, causing out-of-bounds writes: + +$ ../configure --target-list=aarch64-softmmu --enable-sanitizers --enable-avx512bw +$ make tests/unit/test-xbzrle && ./tests/unit/test-xbzrle + +==5518==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x62100000b100 at pc 0x561109a7714d bp 0x7ffed712a440 sp 0x7ffed712a430 +WRITE of size 1 at 0x62100000b100 thread T0 + #0 0x561109a7714c in uleb128_encode_small ../util/cutils.c:831 + #1 0x561109b67f6a in xbzrle_encode_buffer_avx512 ../migration/xbzrle.c:275 + #2 0x5611099a7428 in test_encode_decode_overflow ../tests/unit/test-xbzrle.c:153 + #3 0x7fb2fb65a58d (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x7a58d) + #4 0x7fb2fb65a333 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x7a333) + #5 0x7fb2fb65aa79 in g_test_run_suite (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x7aa79) + #6 0x7fb2fb65aa94 in g_test_run (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x7aa94) + #7 0x5611099a3a23 in main ../tests/unit/test-xbzrle.c:218 + #8 0x7fb2fa78c082 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x24082) + #9 0x5611099a608d in _start (/qemu/build/tests/unit/test-xbzrle+0x28408d) + +0x62100000b100 is located 0 bytes to the right of 4096-byte region [0x62100000a100,0x62100000b100) +allocated by thread T0 here: + #0 0x7fb2fb823a06 in __interceptor_calloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cc:153 + #1 0x7fb2fb637ef0 in g_malloc0 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x57ef0) + +Fix that by performing the overflow check in the inner loop, instead. + +Intel-SIG: commit 1776b70f55c7 migration/xbzrle: fix out-of-bounds write with axv512 +Backport AVX512 support for xbzrle_encode_buffer. + +Signed-off-by: Matheus Tavares Bernardino +Reviewed-by: Dr. 
David Alan Gilbert +Reviewed-by: Juan Quintela +Signed-off-by: Juan Quintela +[ Aichun Shi: amend commit log ] +Signed-off-by: Aichun Shi +--- + migration/xbzrle.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/migration/xbzrle.c b/migration/xbzrle.c +index 21b92d4eae1c..c6f8b209175a 100644 +--- a/migration/xbzrle.c ++++ b/migration/xbzrle.c +@@ -197,10 +197,6 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, + __m512i r = _mm512_set1_epi32(0); + + while (count512s) { +- if (d + 2 > dlen) { +- return -1; +- } +- + int bytes_to_check = 64; + uint64_t mask = 0xffffffffffffffff; + if (count512s == 1) { +@@ -216,6 +212,9 @@ int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, + + bool is_same = (comp & 0x1); + while (bytes_to_check) { ++ if (d + 2 > dlen) { ++ return -1; ++ } + if (is_same) { + if (nzrun_len) { + d += uleb128_encode_small(dst + d, nzrun_len); +-- +2.33.0 + diff --git a/1111-target-i386-Export-GDS_NO-bit-to-guests.patch b/1111-target-i386-Export-GDS_NO-bit-to-guests.patch new file mode 100644 index 0000000000000000000000000000000000000000..eccd1b666df9f059fcb284eafe4479fcf64d56f1 --- /dev/null +++ b/1111-target-i386-Export-GDS_NO-bit-to-guests.patch @@ -0,0 +1,46 @@ +From bd99b7bd43204f88e54ae898b57f6ca17e510383 Mon Sep 17 00:00:00 2001 +From: Pawan Gupta +Date: Mon, 14 Aug 2023 21:54:27 -0700 +Subject: [PATCH 1111/1119] target/i386: Export GDS_NO bit to guests + +commit 3a2a1f97ea349745094e789e6b0768dbd92d0dcd upstream. + +Gather Data Sampling (GDS) is a side-channel attack using Gather +instructions. Some Intel processors will set ARCH_CAP_GDS_NO bit in +MSR IA32_ARCH_CAPABILITIES to report that they are not vulnerable to +GDS. + +Make this bit available to guests. + +Intel-SIG: commit 3a2a1f97ea34 target/i386: Export GDS_NO bit to guests +Backport to export GDS_NO bit to guests(CVE-2022-40982). + +Closes: https://lore.kernel.org/qemu-devel/CAMGffEmG6TNq0n3+4OJAgXc8J0OevY60KHZekXCBs3LoK9vehA@mail.gmail.com/ +Reported-by: Jack Wang +Signed-off-by: Pawan Gupta +Tested-by: Jack Wang +Tested-by: Daniel Sneddon +Message-ID: +Signed-off-by: Paolo Bonzini +[ Aichun Shi: amend commit log ] +Signed-off-by: Aichun Shi +--- + target/i386/cpu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 21300c2d1916..74a183ed9a96 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -1025,7 +1025,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + NULL, "sbdr-ssdp-no", "fbsdp-no", "psdp-no", + NULL, "fb-clear", NULL, NULL, + NULL, NULL, NULL, NULL, +- "pbrsb-no", NULL, NULL, "rfds-no", ++ "pbrsb-no", NULL, "gds-no", "rfds-no", + "rfds-clear", NULL, NULL, NULL, + }, + .msr = { +-- +2.33.0 + diff --git a/1112-target-i386-Add-kvm_get_one_msr-helper.patch b/1112-target-i386-Add-kvm_get_one_msr-helper.patch new file mode 100644 index 0000000000000000000000000000000000000000..3017c4ada5842b57a691dd8802729aaefad6feed --- /dev/null +++ b/1112-target-i386-Add-kvm_get_one_msr-helper.patch @@ -0,0 +1,123 @@ +From ab9d5ec6178e3238b94acbb4406473eba06e25af Mon Sep 17 00:00:00 2001 +From: Yang Weijiang +Date: Tue, 15 Feb 2022 14:52:53 -0500 +Subject: [PATCH 1112/1119] target/i386: Add kvm_get_one_msr helper + +commit 5a778a5f820fdd907b95e93560637a61f6ea3c71 upstream. + +When try to get one msr from KVM, I found there's no such kind of +existing interface while kvm_put_one_msr() is there. So here comes +the patch. 
It'll remove redundant preparation code before finally +call KVM_GET_MSRS IOCTL. + +No functional change intended. + +Intel-SIG: commit 5a778a5f820f target/i386: Add kvm_get_one_msr helper +Backport i386/cpu bugfixes + +Signed-off-by: Yang Weijiang +Message-Id: <20220215195258.29149-4-weijiang.yang@intel.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/kvm/kvm.c | 46 ++++++++++++++++++++++++------------------- + 1 file changed, 26 insertions(+), 20 deletions(-) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 11f2ca08ee3e..2fc4525d2b63 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -142,6 +142,7 @@ static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES]; + + #define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */ + static RateLimit bus_lock_ratelimit_ctrl; ++static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value); + + int kvm_has_pit_state2(void) + { +@@ -212,28 +213,21 @@ static int kvm_get_tsc(CPUState *cs) + { + X86CPU *cpu = X86_CPU(cs); + CPUX86State *env = &cpu->env; +- struct { +- struct kvm_msrs info; +- struct kvm_msr_entry entries[1]; +- } msr_data = {}; ++ uint64_t value; + int ret; + + if (env->tsc_valid) { + return 0; + } + +- memset(&msr_data, 0, sizeof(msr_data)); +- msr_data.info.nmsrs = 1; +- msr_data.entries[0].index = MSR_IA32_TSC; + env->tsc_valid = !runstate_is_running(); + +- ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data); ++ ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value); + if (ret < 0) { + return ret; + } + +- assert(ret == 1); +- env->tsc = msr_data.entries[0].data; ++ env->tsc = value; + return 0; + } + +@@ -1529,21 +1523,14 @@ static int hyperv_init_vcpu(X86CPU *cpu) + * the kernel doesn't support setting vp_index; assert that its value + * is in sync + */ +- struct { +- struct kvm_msrs info; +- struct kvm_msr_entry entries[1]; +- } msr_data = { +- .info.nmsrs = 1, +- .entries[0].index = HV_X64_MSR_VP_INDEX, +- }; ++ uint64_t value; + +- ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data); ++ ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value); + if (ret < 0) { + return ret; + } +- assert(ret == 1); + +- if (msr_data.entries[0].data != hyperv_vp_index(CPU(cpu))) { ++ if (value != hyperv_vp_index(CPU(cpu))) { + error_report("kernel's vp_index != QEMU's vp_index"); + return -ENXIO; + } +@@ -2857,6 +2844,25 @@ static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value) + return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); + } + ++static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value) ++{ ++ int ret; ++ struct { ++ struct kvm_msrs info; ++ struct kvm_msr_entry entries[1]; ++ } msr_data = { ++ .info.nmsrs = 1, ++ .entries[0].index = index, ++ }; ++ ++ ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data); ++ if (ret < 0) { ++ return ret; ++ } ++ assert(ret == 1); ++ *value = msr_data.entries[0].data; ++ return ret; ++} + void kvm_put_apicbase(X86CPU *cpu, uint64_t value) + { + int ret; +-- +2.33.0 + diff --git a/1113-target-i386-Enable-support-for-XSAVES-based-features.patch b/1113-target-i386-Enable-support-for-XSAVES-based-features.patch new file mode 100644 index 0000000000000000000000000000000000000000..5f797ca63b8e9b3d1ceb72966786ea608c449aa4 --- /dev/null +++ b/1113-target-i386-Enable-support-for-XSAVES-based-features.patch @@ -0,0 +1,287 @@ +From 92a93c0d59ee1ad737470f5f5426d8548cd209c8 Mon Sep 17 00:00:00 2001 +From: Yang Weijiang +Date: Tue, 15 Feb 2022 14:52:54 -0500 +Subject: [PATCH 1113/1119] target/i386: Enable support for 
XSAVES based + features + +commit 301e90675c3fed6cdc48682021a1ab42bc0e0d76 upstream. + +There're some new features, including Arch LBR, depending +on XSAVES/XRSTORS support, the new instructions will +save/restore data based on feature bits enabled in XCR0 | XSS. +This patch adds the basic support for related CPUID enumeration +and meanwhile changes the name from FEAT_XSAVE_COMP_{LO|HI} to +FEAT_XSAVE_XCR0_{LO|HI} to differentiate clearly the feature +bits in XCR0 and those in XSS. + +Intel-SIG: commit 301e90675c3f target/i386: Enable support for XSAVES based features +Backport i386/cpu bugfixes + +Signed-off-by: Yang Weijiang +Message-Id: <20220215195258.29149-5-weijiang.yang@intel.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 104 +++++++++++++++++++++++++++++++++++----------- + target/i386/cpu.h | 14 ++++++- + 2 files changed, 92 insertions(+), 26 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 74a183ed9a96..7f7143aa2959 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -978,6 +978,34 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + }, + .tcg_features = TCG_XSAVE_FEATURES, + }, ++ [FEAT_XSAVE_XSS_LO] = { ++ .type = CPUID_FEATURE_WORD, ++ .feat_names = { ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, NULL, ++ }, ++ .cpuid = { ++ .eax = 0xD, ++ .needs_ecx = true, ++ .ecx = 1, ++ .reg = R_ECX, ++ }, ++ }, ++ [FEAT_XSAVE_XSS_HI] = { ++ .type = CPUID_FEATURE_WORD, ++ .cpuid = { ++ .eax = 0xD, ++ .needs_ecx = true, ++ .ecx = 1, ++ .reg = R_EDX ++ }, ++ }, + [FEAT_6_EAX] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { +@@ -993,7 +1021,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + .cpuid = { .eax = 6, .reg = R_EAX, }, + .tcg_features = TCG_6_EAX_FEATURES, + }, +- [FEAT_XSAVE_COMP_LO] = { ++ [FEAT_XSAVE_XCR0_LO] = { + .type = CPUID_FEATURE_WORD, + .cpuid = { + .eax = 0xD, +@@ -1006,7 +1034,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { + XSTATE_OPMASK_MASK | XSTATE_ZMM_Hi256_MASK | XSTATE_Hi16_ZMM_MASK | + XSTATE_PKRU_MASK, + }, +- [FEAT_XSAVE_COMP_HI] = { ++ [FEAT_XSAVE_XCR0_HI] = { + .type = CPUID_FEATURE_WORD, + .cpuid = { + .eax = 0xD, +@@ -1431,6 +1459,9 @@ static const X86RegisterInfo32 x86_reg_info_32[CPU_NB_REGS32] = { + }; + #undef REGISTER + ++/* CPUID feature bits available in XSS */ ++#define CPUID_XSTATE_XSS_MASK (0) ++ + ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT] = { + [XSTATE_FP_BIT] = { + /* x87 FP state component is always enabled if XSAVE is supported */ +@@ -1473,15 +1504,18 @@ ExtSaveArea x86_ext_save_areas[XSAVE_STATE_AREA_COUNT] = { + }, + }; + +-static uint32_t xsave_area_size(uint64_t mask) ++static uint32_t xsave_area_size(uint64_t mask, bool compacted) + { ++ uint64_t ret = x86_ext_save_areas[0].size; ++ const ExtSaveArea *esa; ++ uint32_t offset = 0; + int i; +- uint64_t ret = 0; + +- for (i = 0; i < ARRAY_SIZE(x86_ext_save_areas); i++) { +- const ExtSaveArea *esa = &x86_ext_save_areas[i]; ++ for (i = 2; i < ARRAY_SIZE(x86_ext_save_areas); i++) { ++ esa = &x86_ext_save_areas[i]; + if ((mask >> i) & 1) { +- ret = MAX(ret, esa->offset + esa->size); ++ offset = compacted ? 
ret : esa->offset; ++ ret = MAX(ret, offset + esa->size); + } + } + return ret; +@@ -1492,10 +1526,10 @@ static inline bool accel_uses_host_cpuid(void) + return kvm_enabled() || hvf_enabled(); + } + +-static inline uint64_t x86_cpu_xsave_components(X86CPU *cpu) ++static inline uint64_t x86_cpu_xsave_xcr0_components(X86CPU *cpu) + { +- return ((uint64_t)cpu->env.features[FEAT_XSAVE_COMP_HI]) << 32 | +- cpu->env.features[FEAT_XSAVE_COMP_LO]; ++ return ((uint64_t)cpu->env.features[FEAT_XSAVE_XCR0_HI]) << 32 | ++ cpu->env.features[FEAT_XSAVE_XCR0_LO]; + } + + /* Return name of 32-bit register, from a R_* constant */ +@@ -1507,6 +1541,12 @@ static const char *get_register_name_32(unsigned int reg) + return x86_reg_info_32[reg].name; + } + ++static inline uint64_t x86_cpu_xsave_xss_components(X86CPU *cpu) ++{ ++ return ((uint64_t)cpu->env.features[FEAT_XSAVE_XSS_HI]) << 32 | ++ cpu->env.features[FEAT_XSAVE_XSS_LO]; ++} ++ + /* + * Returns the set of feature flags that are supported and migratable by + * QEMU, for a given FeatureWord. +@@ -5206,8 +5246,8 @@ static const char *x86_cpu_feature_name(FeatureWord w, int bitnr) + /* XSAVE components are automatically enabled by other features, + * so return the original feature name instead + */ +- if (w == FEAT_XSAVE_COMP_LO || w == FEAT_XSAVE_COMP_HI) { +- int comp = (w == FEAT_XSAVE_COMP_HI) ? bitnr + 32 : bitnr; ++ if (w == FEAT_XSAVE_XCR0_LO || w == FEAT_XSAVE_XCR0_HI) { ++ int comp = (w == FEAT_XSAVE_XCR0_HI) ? bitnr + 32 : bitnr; + + if (comp < ARRAY_SIZE(x86_ext_save_areas) && + x86_ext_save_areas[comp].bits) { +@@ -6077,25 +6117,36 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, + } + + if (count == 0) { +- *ecx = xsave_area_size(x86_cpu_xsave_components(cpu)); +- *eax = env->features[FEAT_XSAVE_COMP_LO]; +- *edx = env->features[FEAT_XSAVE_COMP_HI]; ++ *ecx = xsave_area_size(x86_cpu_xsave_xcr0_components(cpu), false); ++ *eax = env->features[FEAT_XSAVE_XCR0_LO]; ++ *edx = env->features[FEAT_XSAVE_XCR0_HI]; + /* + * The initial value of xcr0 and ebx == 0, On host without kvm + * commit 412a3c41(e.g., CentOS 6), the ebx's value always == 0 + * even through guest update xcr0, this will crash some legacy guest + * (e.g., CentOS 6), So set ebx == ecx to workaroud it. + */ +- *ebx = kvm_enabled() ? *ecx : xsave_area_size(env->xcr0); ++ *ebx = kvm_enabled() ? 
*ecx : xsave_area_size(env->xcr0, false); + } else if (count == 1) { ++ uint64_t xstate = x86_cpu_xsave_xcr0_components(cpu) | ++ x86_cpu_xsave_xss_components(cpu); ++ + *eax = env->features[FEAT_XSAVE]; ++ *ebx = xsave_area_size(xstate, true); ++ *ecx = env->features[FEAT_XSAVE_XSS_LO]; ++ *edx = env->features[FEAT_XSAVE_XSS_HI]; + } else if (count < ARRAY_SIZE(x86_ext_save_areas)) { +- if ((x86_cpu_xsave_components(cpu) >> count) & 1) { +- const ExtSaveArea *esa = &x86_ext_save_areas[count]; ++ const ExtSaveArea *esa = &x86_ext_save_areas[count]; ++ ++ if (x86_cpu_xsave_xcr0_components(cpu) & (1ULL << count)) { + *eax = esa->size; + *ebx = esa->offset; + *ecx = esa->ecx & + (ESA_FEATURE_ALIGN64_MASK | ESA_FEATURE_XFD_MASK); ++ } else if (x86_cpu_xsave_xss_components(cpu) & (1ULL << count)) { ++ *eax = esa->size; ++ *ebx = 0; ++ *ecx = 1; + } + } + break; +@@ -6146,8 +6197,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, + } else { + *eax &= env->features[FEAT_SGX_12_1_EAX]; + *ebx &= 0; /* ebx reserve */ +- *ecx &= env->features[FEAT_XSAVE_COMP_LO]; +- *edx &= env->features[FEAT_XSAVE_COMP_HI]; ++ *ecx &= env->features[FEAT_XSAVE_XSS_LO]; ++ *edx &= env->features[FEAT_XSAVE_XSS_HI]; + + /* FP and SSE are always allowed regardless of XSAVE/XCR0. */ + *ecx |= XSTATE_FP_MASK | XSTATE_SSE_MASK; +@@ -6544,6 +6595,9 @@ static void x86_cpu_reset(DeviceState *dev) + } + for (i = 2; i < ARRAY_SIZE(x86_ext_save_areas); i++) { + const ExtSaveArea *esa = &x86_ext_save_areas[i]; ++ if (!((1 << i) & CPUID_XSTATE_XCR0_MASK)) { ++ continue; ++ } + if (env->features[esa->feature] & esa->bits) { + xcr0 |= 1ull << i; + } +@@ -6661,8 +6715,8 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) + static bool request_perm; + + if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { +- env->features[FEAT_XSAVE_COMP_LO] = 0; +- env->features[FEAT_XSAVE_COMP_HI] = 0; ++ env->features[FEAT_XSAVE_XCR0_LO] = 0; ++ env->features[FEAT_XSAVE_XCR0_HI] = 0; + return; + } + +@@ -6680,8 +6734,10 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) + request_perm = true; + } + +- env->features[FEAT_XSAVE_COMP_LO] = mask; +- env->features[FEAT_XSAVE_COMP_HI] = mask >> 32; ++ env->features[FEAT_XSAVE_XCR0_LO] = mask & CPUID_XSTATE_XCR0_MASK; ++ env->features[FEAT_XSAVE_XCR0_HI] = mask >> 32; ++ env->features[FEAT_XSAVE_XSS_LO] = mask & CPUID_XSTATE_XSS_MASK; ++ env->features[FEAT_XSAVE_XSS_HI] = mask >> 32; + } + + /***** Steps involved on loading and filtering CPUID data +diff --git a/target/i386/cpu.h b/target/i386/cpu.h +index c415bb3db890..b0d79a1519b5 100644 +--- a/target/i386/cpu.h ++++ b/target/i386/cpu.h +@@ -566,6 +566,14 @@ typedef enum X86Seg { + #define ESA_FEATURE_XFD_MASK (1U << ESA_FEATURE_XFD_BIT) + + ++/* CPUID feature bits available in XCR0 */ ++#define CPUID_XSTATE_XCR0_MASK (XSTATE_FP_MASK | XSTATE_SSE_MASK | \ ++ XSTATE_YMM_MASK | XSTATE_BNDREGS_MASK | \ ++ XSTATE_BNDCSR_MASK | XSTATE_OPMASK_MASK | \ ++ XSTATE_ZMM_Hi256_MASK | \ ++ XSTATE_Hi16_ZMM_MASK | XSTATE_PKRU_MASK | \ ++ XSTATE_XTILE_CFG_MASK | XSTATE_XTILE_DATA_MASK) ++ + /* CPUID feature words */ + typedef enum FeatureWord { + FEAT_1_EDX, /* CPUID[1].EDX */ +@@ -584,8 +592,8 @@ typedef enum FeatureWord { + FEAT_SVM, /* CPUID[8000_000A].EDX */ + FEAT_XSAVE, /* CPUID[EAX=0xd,ECX=1].EAX */ + FEAT_6_EAX, /* CPUID[6].EAX */ +- FEAT_XSAVE_COMP_LO, /* CPUID[EAX=0xd,ECX=0].EAX */ +- FEAT_XSAVE_COMP_HI, /* CPUID[EAX=0xd,ECX=0].EDX */ ++ FEAT_XSAVE_XCR0_LO, /* CPUID[EAX=0xd,ECX=0].EAX */ ++ FEAT_XSAVE_XCR0_HI, /* 
CPUID[EAX=0xd,ECX=0].EDX */ + FEAT_ARCH_CAPABILITIES, + FEAT_CORE_CAPABILITY, + FEAT_PERF_CAPABILITIES, +@@ -602,6 +610,8 @@ typedef enum FeatureWord { + FEAT_SGX_12_0_EAX, /* CPUID[EAX=0x12,ECX=0].EAX (SGX) */ + FEAT_SGX_12_0_EBX, /* CPUID[EAX=0x12,ECX=0].EBX (SGX MISCSELECT[31:0]) */ + FEAT_SGX_12_1_EAX, /* CPUID[EAX=0x12,ECX=1].EAX (SGX ATTRIBUTES[31:0]) */ ++ FEAT_XSAVE_XSS_LO, /* CPUID[EAX=0xd,ECX=1].ECX */ ++ FEAT_XSAVE_XSS_HI, /* CPUID[EAX=0xd,ECX=1].EDX */ + FEAT_7_1_EDX, /* CPUID[EAX=7,ECX=1].EDX */ + FEAT_7_2_EDX, /* CPUID[EAX=7,ECX=2].EDX */ + FEATURE_WORDS, +-- +2.33.0 + diff --git a/1114-target-i386-Change-wrong-XFRM-value-in-SGX-CPUID-lea.patch b/1114-target-i386-Change-wrong-XFRM-value-in-SGX-CPUID-lea.patch new file mode 100644 index 0000000000000000000000000000000000000000..046614be8b2e70833d9687c50427ead9d606f617 --- /dev/null +++ b/1114-target-i386-Change-wrong-XFRM-value-in-SGX-CPUID-lea.patch @@ -0,0 +1,47 @@ +From 93ab0eb2840ac864df58e70388586df8fb37ccc6 Mon Sep 17 00:00:00 2001 +From: Yang Zhong +Date: Thu, 6 Apr 2023 02:40:41 -0400 +Subject: [PATCH 1114/1119] target/i386: Change wrong XFRM value in SGX CPUID + leaf +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 72497cff896fecf74306ed33626c30e43633cdd6 upstream. + +The previous patch wrongly replaced FEAT_XSAVE_XCR0_{LO|HI} with +FEAT_XSAVE_XSS_{LO|HI} in CPUID(EAX=12,ECX=1):{ECX,EDX}. As a result, +SGX enclaves only supported SSE and x87 feature (xfrm=0x3). + +Intel-SIG: commit 72497cff896f target/i386: Change wrong XFRM value in SGX CPUID leaf +Backport i386/cpu bugfixes + +Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") +Signed-off-by: Yang Zhong +Reviewed-by: Yang Weijiang +Reviewed-by: Kai Huang +Message-Id: <20230406064041.420039-1-yang.zhong@linux.intel.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 7f7143aa2959..ddd5e3029438 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6197,8 +6197,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, + } else { + *eax &= env->features[FEAT_SGX_12_1_EAX]; + *ebx &= 0; /* ebx reserve */ +- *ecx &= env->features[FEAT_XSAVE_XSS_LO]; +- *edx &= env->features[FEAT_XSAVE_XSS_HI]; ++ *ecx &= env->features[FEAT_XSAVE_XCR0_LO]; ++ *edx &= env->features[FEAT_XSAVE_XCR0_HI]; + + /* FP and SSE are always allowed regardless of XSAVE/XCR0. */ + *ecx |= XSTATE_FP_MASK | XSTATE_SSE_MASK; +-- +2.33.0 + diff --git a/1115-i386-cpu-Clear-FEAT_XSAVE_XSS_LO-HI-leafs-when-CPUID.patch b/1115-i386-cpu-Clear-FEAT_XSAVE_XSS_LO-HI-leafs-when-CPUID.patch new file mode 100644 index 0000000000000000000000000000000000000000..99c8c2764bab7be48e5947e3e3f213f991d81922 --- /dev/null +++ b/1115-i386-cpu-Clear-FEAT_XSAVE_XSS_LO-HI-leafs-when-CPUID.patch @@ -0,0 +1,41 @@ +From a21d0aff7e8c828ff2d07220a210c84d902e4429 Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Mon, 15 Jan 2024 04:13:24 -0500 +Subject: [PATCH 1115/1119] i386/cpu: Clear FEAT_XSAVE_XSS_LO/HI leafs when + CPUID_EXT_XSAVE is not available + +commit 81f5cad3858f27623b1b14467926032d229b76cc upstream. + +Leaf FEAT_XSAVE_XSS_LO and FEAT_XSAVE_XSS_HI also need to be cleared +when CPUID_EXT_XSAVE is not set. 
+ +Intel-SIG: commit 81f5cad3858f i386/cpu: Clear FEAT_XSAVE_XSS_LO/HI leafs when CPUID_EXT_XSAVE is not available +Backport i386/cpu bugfixes + +Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") +Signed-off-by: Xiaoyao Li +Reviewed-by: Yang Weijiang +Message-ID: <20240115091325.1904229-2-xiaoyao.li@intel.com> +Cc: qemu-stable@nongnu.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index ddd5e3029438..23dbaccb8344 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6717,6 +6717,8 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) + if (!(env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE)) { + env->features[FEAT_XSAVE_XCR0_LO] = 0; + env->features[FEAT_XSAVE_XCR0_HI] = 0; ++ env->features[FEAT_XSAVE_XSS_LO] = 0; ++ env->features[FEAT_XSAVE_XSS_HI] = 0; + return; + } + +-- +2.33.0 + diff --git a/1116-i386-cpu-Mask-with-XCR0-XSS-mask-for-FEAT_XSAVE_XCR0.patch b/1116-i386-cpu-Mask-with-XCR0-XSS-mask-for-FEAT_XSAVE_XCR0.patch new file mode 100644 index 0000000000000000000000000000000000000000..9a947a1740fe0c8ef0a0ddc21bb6ab193858337d --- /dev/null +++ b/1116-i386-cpu-Mask-with-XCR0-XSS-mask-for-FEAT_XSAVE_XCR0.patch @@ -0,0 +1,45 @@ +From 537d7d94c791bc2a97cea20436e9c0ad92cb3018 Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Mon, 15 Jan 2024 04:13:25 -0500 +Subject: [PATCH 1116/1119] i386/cpu: Mask with XCR0/XSS mask for + FEAT_XSAVE_XCR0_HI and FEAT_XSAVE_XSS_HI leafs + +commit a11a365159b944e05be76f3ec3b98c8b38cb70fd upstream. + +The value of FEAT_XSAVE_XCR0_HI leaf and FEAT_XSAVE_XSS_HI leaf also +need to be masked by XCR0 and XSS mask respectively, to make it +logically correct. 
+ +Intel-SIG: commit a11a365159b9 i386/cpu: Mask with XCR0/XSS mask for FEAT_XSAVE_XCR0_HI and FEAT_XSAVE_XSS_HI leafs +Backport i386/cpu bugfixes + +Fixes: 301e90675c3f ("target/i386: Enable support for XSAVES based features") +Signed-off-by: Xiaoyao Li +Reviewed-by: Yang Weijiang +Message-ID: <20240115091325.1904229-3-xiaoyao.li@intel.com> +Cc: qemu-stable@nongnu.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/cpu.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index 23dbaccb8344..f151a5a403aa 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -6737,9 +6737,9 @@ static void x86_cpu_enable_xsave_components(X86CPU *cpu) + } + + env->features[FEAT_XSAVE_XCR0_LO] = mask & CPUID_XSTATE_XCR0_MASK; +- env->features[FEAT_XSAVE_XCR0_HI] = mask >> 32; ++ env->features[FEAT_XSAVE_XCR0_HI] = (mask & CPUID_XSTATE_XCR0_MASK) >> 32; + env->features[FEAT_XSAVE_XSS_LO] = mask & CPUID_XSTATE_XSS_MASK; +- env->features[FEAT_XSAVE_XSS_HI] = mask >> 32; ++ env->features[FEAT_XSAVE_XSS_HI] = (mask & CPUID_XSTATE_XSS_MASK) >> 32; + } + + /***** Steps involved on loading and filtering CPUID data +-- +2.33.0 + diff --git a/1117-i386-cpuid-Decrease-cpuid_i-when-skipping-CPUID-leaf.patch b/1117-i386-cpuid-Decrease-cpuid_i-when-skipping-CPUID-leaf.patch new file mode 100644 index 0000000000000000000000000000000000000000..05f2465e70fcb9268f3779220d6e5dfb0da3c840 --- /dev/null +++ b/1117-i386-cpuid-Decrease-cpuid_i-when-skipping-CPUID-leaf.patch @@ -0,0 +1,42 @@ +From cc367cb8ad5a89182eef2503d25f6506768fc51b Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Wed, 24 Jan 2024 21:40:14 -0500 +Subject: [PATCH 1117/1119] i386/cpuid: Decrease cpuid_i when skipping CPUID + leaf 1F + +commit 10f92799af8ba3c3cef2352adcd4780f13fbab31 upstream. + +Existing code misses a decrement of cpuid_i when skipping leaf 0x1F. +There's a blank CPUID entry (with leaf, subleaf as 0, and all fields +stuffed 0s) left in the CPUID array. + +It conflicts with correct CPUID leaf 0. + +Intel-SIG: commit 10f92799af8b i386/cpuid: Decrease cpuid_i when skipping CPUID leaf 1F +Backport i386/cpu bugfixes + +Signed-off-by: Xiaoyao Li +Reviewed-by: Yang Weijiang +Message-ID: <20240125024016.2521244-2-xiaoyao.li@intel.com> +Cc: qemu-stable@nongnu.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/kvm/kvm.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 2fc4525d2b63..3c0aa7e80b99 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -1783,6 +1783,7 @@ int kvm_arch_init_vcpu(CPUState *cs) + } + case 0x1f: + if (env->nr_dies < 2) { ++ cpuid_i--; + break; + } + /* fallthrough */ +-- +2.33.0 + diff --git a/1118-i386-cpuid-Move-leaf-7-to-correct-group.patch b/1118-i386-cpuid-Move-leaf-7-to-correct-group.patch new file mode 100644 index 0000000000000000000000000000000000000000..f0f652e962e1acf5ffdc8fbbf4c60c80f1dbd9a4 --- /dev/null +++ b/1118-i386-cpuid-Move-leaf-7-to-correct-group.patch @@ -0,0 +1,53 @@ +From f701161f04b084a851b699ebec5faf2d46728833 Mon Sep 17 00:00:00 2001 +From: Xiaoyao Li +Date: Wed, 24 Jan 2024 21:40:16 -0500 +Subject: [PATCH 1118/1119] i386/cpuid: Move leaf 7 to correct group + +commit 0729857c707535847d7fe31d3d91eb8b2a118e3c upstream. + +CPUID leaf 7 was grouped together with SGX leaf 0x12 by commit +b9edbadefb9e ("i386: Propagate SGX CPUID sub-leafs to KVM") by mistake.
+ +SGX leaf 0x12 has its specific logic to check if subleaf (starting from 2) +is valid or not by checking the bit 0:3 of corresponding EAX is 1 or +not. + +Leaf 7 follows the logic that EAX of subleaf 0 enumerates the maximum +valid subleaf. + +Intel-SIG: commit 0729857c7075 i386/cpuid: Move leaf 7 to correct group +Backport i386/cpu bugfixes + +Fixes: b9edbadefb9e ("i386: Propagate SGX CPUID sub-leafs to KVM") +Signed-off-by: Xiaoyao Li +Message-ID: <20240125024016.2521244-4-xiaoyao.li@intel.com> +Cc: qemu-stable@nongnu.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Jason Zeng +--- + target/i386/kvm/kvm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c +index 3c0aa7e80b99..37d6632d7711 100644 +--- a/target/i386/kvm/kvm.c ++++ b/target/i386/kvm/kvm.c +@@ -1824,7 +1824,6 @@ int kvm_arch_init_vcpu(CPUState *cs) + c = &cpuid_data.entries[cpuid_i++]; + } + break; +- case 0x7: + case 0x12: + for (j = 0; ; j++) { + c->function = i; +@@ -1844,6 +1843,7 @@ int kvm_arch_init_vcpu(CPUState *cs) + c = &cpuid_data.entries[cpuid_i++]; + } + break; ++ case 0x7: + case 0x14: + case 0x1d: + case 0x1e: { +-- +2.33.0 + diff --git a/1119-target-i386-Introduce-Icelake-Server-v7-to-enable-TS.patch b/1119-target-i386-Introduce-Icelake-Server-v7-to-enable-TS.patch new file mode 100644 index 0000000000000000000000000000000000000000..d3b5432a55de9675ec25088782aac87b67d9344e --- /dev/null +++ b/1119-target-i386-Introduce-Icelake-Server-v7-to-enable-TS.patch @@ -0,0 +1,65 @@ +From f0442aeede3c2e1c02d3e61c30e136a59716dacc Mon Sep 17 00:00:00 2001 +From: Zhenzhong Duan +Date: Wed, 20 Mar 2024 17:31:38 +0800 +Subject: [PATCH 1119/1119] target/i386: Introduce Icelake-Server-v7 to enable + TSX + +commit c895fa54e3060c5ac6f3888dce96c9b78626072b upstream. + +When start L2 guest with both L1/L2 using Icelake-Server-v3 or above, +QEMU reports below warning: + +"warning: host doesn't support requested feature: MSR(10AH).taa-no [bit 8]" + +Reason is QEMU Icelake-Server-v3 has TSX feature disabled but enables taa-no +bit. It's meaningless that TSX isn't supported but still claim TSX is secure. +So L1 KVM doesn't expose taa-no to L2 if TSX is unsupported, then starting L2 +triggers the warning. + +Fix it by introducing a new version Icelake-Server-v7 which has both TSX +and taa-no features. Then guest can use TSX securely when it see taa-no. + +This matches the production Icelake which supports TSX and isn't susceptible +to TSX Async Abort (TAA) vulnerabilities, a.k.a, taa-no. + +Ideally, TSX should have being enabled together with taa-no since v3, but for +compatibility, we'd better to add v7 to enable it. + +Fixes: d965dc35592d ("target/i386: Add ARCH_CAPABILITIES related bits into Icelake-Server CPU model") + +Intel-SIG: commit c895fa54e306 target/i386: Introduce Icelake-Server-v7 to enable TSX. 
+ +Tested-by: Xiangfei Ma +Signed-off-by: Zhenzhong Duan +Message-ID: <20240320093138.80267-2-zhenzhong.duan@intel.com> +Signed-off-by: Paolo Bonzini +[ Quanxian Wang: amend commit log ] +Signed-off-by: Quanxian Wang +--- + target/i386/cpu.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/target/i386/cpu.c b/target/i386/cpu.c +index f151a5a403aa..923fa99b6e06 100644 +--- a/target/i386/cpu.c ++++ b/target/i386/cpu.c +@@ -3673,6 +3673,16 @@ static const X86CPUDefinition builtin_x86_defs[] = { + { /* end of list */ } + }, + }, ++ { ++ .version = 7, ++ .note = "TSX, taa-no", ++ .props = (PropValue[]) { ++ /* Restore TSX features removed by -v2 above */ ++ { "hle", "on" }, ++ { "rtm", "on" }, ++ { /* end of list */ } ++ }, ++ }, + { /* end of list */ } + } + }, +-- +2.33.0 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index b7a53c33b9716c3fdd60c0d085e5485d38b9091e..f0f10d2ca52dd8437e3692f5aebad1e4713111ca 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -1002,6 +1002,29 @@ Patch1093: 1093-target-i386-Add-few-security-fix-bits-in-ARCH_CAPABI.patch Patch1094: 1094-target-i386-Introduce-SapphireRapids-v3-to-add-missi.patch Patch1095: 1095-ebpf-replace-deprecated-bpf_program__set_socket_filt.patch Patch1096: 1096-target-i386-Export-MSR_ARCH_CAPABILITIES-bits-to-gue.patch +Patch1097: 1097-Update-linux-headers-to-v6.0-rc4.patch +Patch1098: 1098-i386-kvm-extend-kvm_-get-put-_vcpu_events-to-support.patch +Patch1099: 1099-kvm-allow-target-specific-accelerator-properties.patch +Patch1100: 1100-kvm-expose-struct-KVMState.patch +Patch1101: 1101-i386-add-notify-VM-exit-support.patch +Patch1102: 1102-target-i386-Fix-sanity-check-on-max-APIC-ID-X2APIC-e.patch +Patch1103: 1103-target-i386-Set-maximum-APIC-ID-to-KVM-prior-to-vCPU.patch +Patch1104: 1104-target-i386-Add-SGX-aex-notify-and-EDECCSSA-support.patch +Patch1105: 1105-target-i386-KVM-allow-fast-string-operations-if-host.patch +Patch1106: 1106-configure-meson-move-AVX-tests-to-meson.patch +Patch1107: 1107-AVX512-support-for-xbzrle_encode_buffer.patch +Patch1108: 1108-Update-bench-code-for-addressing-CI-problem.patch +Patch1109: 1109-migration-xbzrle-use-ctz64-to-avoid-undefined-result.patch +Patch1110: 1110-migration-xbzrle-fix-out-of-bounds-write-with-axv512.patch +Patch1111: 1111-target-i386-Export-GDS_NO-bit-to-guests.patch +Patch1112: 1112-target-i386-Add-kvm_get_one_msr-helper.patch +Patch1113: 1113-target-i386-Enable-support-for-XSAVES-based-features.patch +Patch1114: 1114-target-i386-Change-wrong-XFRM-value-in-SGX-CPUID-lea.patch +Patch1115: 1115-i386-cpu-Clear-FEAT_XSAVE_XSS_LO-HI-leafs-when-CPUID.patch +Patch1116: 1116-i386-cpu-Mask-with-XCR0-XSS-mask-for-FEAT_XSAVE_XCR0.patch +Patch1117: 1117-i386-cpuid-Decrease-cpuid_i-when-skipping-CPUID-leaf.patch +Patch1118: 1118-i386-cpuid-Move-leaf-7-to-correct-group.patch +Patch1119: 1119-target-i386-Introduce-Icelake-Server-v7-to-enable-TS.patch BuildRequires: wget BuildRequires: rpm-build @@ -2279,6 +2302,7 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.modules &> /dev/null || : - Add Hygon Dhyana-v3 and Dharma CPU model (zhouyanjing@hygon.cn) - Intel-SIG: Supprt Intel SPR/GNR/SRF new ISAs and cpu models (quanxian.wang@intel.com) - Intel-SIG: Supprt new SPR models (quanxian.wang@intel.com) +- Intel-SIG: Sync more intel platform related features/fixes from upstream (jason.zeng@intel.com) * Tue Oct 15 2024 Jon Maloy - 6.2.0-53.el8.2 - kvm-Fix-thread-pool-size-default-value-in-the-man-page.patch [RHEL-26197]