From aeaa6f6714312cd9a49793b9d966538b68a32486 Mon Sep 17 00:00:00 2001
From: leoliu-oc
Date: Tue, 16 Jan 2024 15:17:23 +0800
Subject: [PATCH 1/2] anolis: x86/delay: add support for Zhaoxin ZXPAUSE instruction

ANBZ: #7918

ZXPAUSE instructs the processor to enter an implementation-dependent
optimized state. The instruction execution wakes up when the time-stamp
counter reaches or exceeds the implicit EDX:EAX 64-bit input value. The
instruction execution also wakes up due to the expiration of the
operating system time-limit or by an external interrupt.

ZXPAUSE is available on processors with X86_FEATURE_ZXPAUSE.

ZXPAUSE allows the processor to enter a light-weight power/performance
optimized state (C0.1 state) for a period specified by the instruction
or until the system time limit expires.

The MSR_ZX_PAUSE_CONTROL MSR allows the OS to enable/disable C0.2 on the
processor and to set the maximum time the processor can reside in C0.1
or C0.2. By default C0.2 is disabled.

A sysfs interface to adjust the time and the C0.2 enablement is provided
as part of this change.

Signed-off-by: leoliu-oc
---
 arch/x86/include/asm/cpufeature.h        |   7 +-
 arch/x86/include/asm/cpufeatures.h       |   5 +-
 arch/x86/include/asm/delay.h             |   1 +
 arch/x86/include/asm/disabled-features.h |   3 +-
 arch/x86/include/asm/msr-index.h         |  11 ++
 arch/x86/include/asm/mwait.h             |  22 +++
 arch/x86/include/asm/required-features.h |   3 +-
 arch/x86/kernel/cpu/Makefile             |   1 +
 arch/x86/kernel/cpu/centaur.c            |   4 +
 arch/x86/kernel/cpu/zhaoxin.c            |   3 +
 arch/x86/kernel/cpu/zxpause.c            | 238 +++++++++++++++++++++++
 arch/x86/kernel/time.c                   |   3 +
 arch/x86/lib/delay.c                     |  51 +++++
 tools/arch/x86/include/asm/cpufeatures.h |   5 +-
 14 files changed, 351 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/zxpause.c

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4ce54074eea5..8041b559ad2a 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -30,6 +30,7 @@ enum cpuid_leafs
 	CPUID_7_ECX,
 	CPUID_8000_0007_EBX,
 	CPUID_7_EDX,
+	CPUID_C000_0006_EAX,
 };

 #ifdef CONFIG_X86_FEATURE_NAMES
@@ -81,8 +82,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
	 CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
	 CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
	 CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
+	 CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
	 REQUIRED_MASK_CHECK || \
-	 BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+	 BUILD_BUG_ON_ZERO(NCAPINTS != 20))

 #define DISABLED_MASK_BIT_SET(feature_bit) \
	( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -104,8 +106,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
	 CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
	 CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
	 CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
+	 CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
	 DISABLED_MASK_CHECK || \
-	 BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+	 BUILD_BUG_ON_ZERO(NCAPINTS != 20))

 #define cpu_has(c, bit) \
	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 469fa5c95694..7542ff456ede 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
 /*
  * Defines x86 CPU feature bits
  */
-#define NCAPINTS	19	/* N 32-bit words worth of info */
+#define NCAPINTS	20	/* N 32-bit words worth of info */
 #define NBUGINTS	2	/* N 32-bit bug flags */

 /*
@@ -398,6 +398,9 @@
 #define X86_FEATURE_CORE_CAPABILITIES	(18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
 #define X86_FEATURE_SPEC_CTRL_SSBD	(18*32+31) /* "" Speculative Store Bypass Disable */

+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 19 */
+#define X86_FEATURE_ZXPAUSE	(19*32 + 0) /* ZHAOXIN ZXPAUSE */
+
 /*
  * BUG word(s)
  */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index de9e7841f953..777de5c62bf5 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -5,6 +5,7 @@
 #include

 void use_tsc_delay(void);
+void use_zxpause_delay(void);
 void use_mwaitx_delay(void);

 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index d2e414cb3ef1..17701df637ea 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -84,6 +84,7 @@
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
 #define DISABLED_MASK17	0
 #define DISABLED_MASK18	0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define DISABLED_MASK19	0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)

 #endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4353c3b8f93a..c38444f4eed3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -880,4 +880,15 @@
 #define MSR_VM_IGNNE	0xc0010115
 #define MSR_VM_HSAVE_PA	0xc0010117
+
+#define MSR_ZX_PAUSE_CONTROL		0x187f
+#define MSR_ZX_PAUSE_CONTROL_C02_DISABLE	BIT(0)
+#define MSR_ZX_PAUSE_CONTROL_RESERVED	BIT(1)
+
+/*
+ * The time field is bit[31:2], but representing a 32bit value with
+ * bit[1:0] zero.
+ */
+#define MSR_ZX_PAUSE_CONTROL_TIME_MASK	(~0x03U)
+
 #endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 3aa82deeab5a..6f02d93586d0 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -23,6 +23,8 @@
 #define MWAITX_MAX_LOOPS	((u32)-1)
 #define MWAITX_DISABLE_CSTATES	0xf0

+#define ZXPAUSE_C01_STATE	1
+
 static inline void __monitor(const void *eax, unsigned long ecx,
			     unsigned long edx)
 {
@@ -120,4 +122,24 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 	current_clr_polling();
 }

+/*
+ * Caller can specify whether to enter C0.1 (low latency, less
+ * power saving) or C0.2 state (saves more power, but longer wakeup
+ * latency). This may be overridden by the ZX_PAUSE_CONTROL MSR
+ * which can force requests for C0.2 to be downgraded to C0.1.
+ */
+static inline void __zxpause(u32 ecx, u32 edx, u32 eax)
+{
+	/* "zxpause %ecx, %edx, %eax;" */
+	#ifdef CONFIG_AS_ZXPAUSE
+	asm volatile("zxpause %%ecx\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+	#else
+	asm volatile(".byte 0xf2, 0x0f, 0xa6, 0xd0\t\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+	#endif
+}
+
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 6847d85400a8..fa5700097f64 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -101,6 +101,7 @@
 #define REQUIRED_MASK16	0
 #define REQUIRED_MASK17	0
 #define REQUIRED_MASK18	0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define REQUIRED_MASK19	0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)

 #endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index da019595504e..8b3f3f8f45dd 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -25,6 +25,7 @@ obj-y += bugs.o
 obj-y += aperfmperf.o
 obj-y += cpuid-deps.o
 obj-y += umwait.o
+obj-y += zxpause.o

 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 169bea16a2e9..a057be1bfac6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -135,6 +135,10 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
 		if (edx & (1U << 28))
 			c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
 	}
+
+	if (cpuid_eax(0xC0000000) >= 0xC0000006)
+		c->x86_capability[CPUID_C000_0006_EAX] = cpuid_eax(0xC0000006);
+
 	if (detect_extended_topology_early(c) < 0)
 		detect_ht_early(c);
 }
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index bc792474b55f..6b1c24308ea1 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -92,6 +92,9 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
 	if (c->x86 == 0x6 || (c->x86 == 0x7 && c->x86_model <= 0x3b))
 		set_cpu_cap(c, X86_FEATURE_CRC32C_LOW_PERF);

+	if (cpuid_eax(0xC0000000) >= 0xC0000006)
+		c->x86_capability[CPUID_C000_0006_EAX] = cpuid_eax(0xC0000006);
+
 	if (detect_extended_topology_early(c) < 0)
 		detect_ht_early(c);
 }
diff --git a/arch/x86/kernel/cpu/zxpause.c b/arch/x86/kernel/cpu/zxpause.c
new file mode 100644
index 000000000000..b0d4b5168c89
--- /dev/null
+++ b/arch/x86/kernel/cpu/zxpause.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+
+#include
+#include
+
+#define ZXPAUSE_C02_ENABLE	1
+
+#define ZXPAUSE_CTRL_VAL(max_time, c02_disable)	\
+	(((max_time) & MSR_ZX_PAUSE_CONTROL_TIME_MASK) | \
+	((c02_disable) & MSR_ZX_PAUSE_CONTROL_C02_DISABLE))
+
+/*
+ * Cache ZX_PAUSE_CONTROL MSR. This is a systemwide control. By default,
+ * zxpause max time is 100000 in TSC-quanta and C0.2 is disabled
+ */
+static u32 zxpause_control_cached = ZXPAUSE_CTRL_VAL(100000, ZXPAUSE_C02_ENABLE);
+
+/*
+ * Cache the original ZX_PAUSE_CONTROL MSR value which is configured by
+ * hardware or BIOS before kernel boot.
+ */
+static u32 orig_zxpause_control_cached __ro_after_init;
+
+/*
+ * Serialize access to zxpause_control_cached and ZX_PAUSE_CONTROL MSR in
+ * the sysfs write functions.
+ */
+static DEFINE_MUTEX(zxpause_lock);
+
+static void zxpause_update_control_msr(void *unused)
+{
+	lockdep_assert_irqs_disabled();
+	wrmsr(MSR_ZX_PAUSE_CONTROL, READ_ONCE(zxpause_control_cached), 0);
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the global control
+ * value.
+ *
+ * Disable interrupts so the read of zxpause_control_cached and the WRMSR
+ * are protected against a concurrent sysfs write. Otherwise the sysfs
+ * write could update the cached value after it had been read on this CPU
+ * and issue the IPI before the old value had been written. The IPI would
+ * interrupt, write the new value and after return from IPI the previous
+ * value would be written by this CPU.
+ *
+ * With interrupts disabled the upcoming CPU either sees the new control
+ * value or the IPI is updating this CPU to the new control value after
+ * interrupts have been reenabled.
+ */
+static int zxpause_cpu_online(unsigned int cpu)
+{
+	local_irq_disable();
+	zxpause_update_control_msr(NULL);
+	local_irq_enable();
+	return 0;
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the original control
+ * value.
+ */
+static int zxpause_cpu_offline(unsigned int cpu)
+{
+	/*
+	 * This code is protected by the CPU hotplug already and
+	 * orig_zxpause_control_cached is never changed after it caches
+	 * the original control MSR value in zxpause_init(). So there
+	 * is no race condition here.
+	 */
+	wrmsr(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached, 0);
+
+	return 0;
+}
+
+/*
+ * On resume, restore ZX_PAUSE_CONTROL MSR on the boot processor which
+ * is the only active CPU at this time. The MSR is set up on the APs via the
+ * CPU hotplug callback.
+ *
+ * This function is invoked on resume from suspend and hibernation. On
+ * resume from suspend the restore should not be required, but we neither
+ * trust the firmware nor does it matter if the same value is written
+ * again.
+ */
+static void zxpause_syscore_resume(void)
+{
+	zxpause_update_control_msr(NULL);
+}
+
+static struct syscore_ops zxpause_syscore_ops = {
+	.resume	= zxpause_syscore_resume,
+};
+
+/* sysfs interface */
+
+/*
+ * When bit 0 in ZX_PAUSE_CONTROL MSR is 1, C0.2 is disabled.
+ * Otherwise, C0.2 is enabled.
+ */
+static inline bool zxpause_ctrl_c02_enabled(u32 ctrl)
+{
+	return !(ctrl & MSR_ZX_PAUSE_CONTROL_C02_DISABLE);
+}
+
+static inline u32 zxpause_ctrl_max_time(u32 ctrl)
+{
+	return ctrl & MSR_ZX_PAUSE_CONTROL_TIME_MASK;
+}
+
+static inline void zxpause_update_control(u32 maxtime, bool c02_enable)
+{
+	u32 ctrl = maxtime & MSR_ZX_PAUSE_CONTROL_TIME_MASK;
+
+	if (!c02_enable)
+		ctrl |= MSR_ZX_PAUSE_CONTROL_C02_DISABLE;
+
+	WRITE_ONCE(zxpause_control_cached, ctrl);
+	/* Propagate to all CPUs */
+	on_each_cpu(zxpause_update_control_msr, NULL, 1);
+}
+
+static ssize_t
+enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	u32 ctrl = READ_ONCE(zxpause_control_cached);
+
+	return sprintf(buf, "%d\n", zxpause_ctrl_c02_enabled(ctrl));
+}
+
+static ssize_t enable_c02_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	bool c02_enable;
+	u32 ctrl;
+	int ret;
+
+	ret = kstrtobool(buf, &c02_enable);
+	if (ret)
+		return ret;
+
+	mutex_lock(&zxpause_lock);
+
+	ctrl = READ_ONCE(zxpause_control_cached);
+	if (c02_enable != zxpause_ctrl_c02_enabled(ctrl))
+		zxpause_update_control(ctrl, c02_enable);
+
+	mutex_unlock(&zxpause_lock);
+
+	return count;
+}
+static DEVICE_ATTR_RW(enable_c02);
+
+static ssize_t
+max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
+{
+	u32 ctrl = READ_ONCE(zxpause_control_cached);
+
+	return sprintf(buf, "%u\n", zxpause_ctrl_max_time(ctrl));
+}
+
+static ssize_t max_time_store(struct device *kobj,
+			      struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	u32 max_time, ctrl;
+	int ret;
+
+	ret = kstrtou32(buf, 0, &max_time);
+	if (ret)
+		return ret;
+
+	/* bits[1:0] must be zero */
+	if (max_time & ~MSR_ZX_PAUSE_CONTROL_TIME_MASK)
+		return -EINVAL;
+
+	mutex_lock(&zxpause_lock);
+
+	ctrl = READ_ONCE(zxpause_control_cached);
+	if (max_time != zxpause_ctrl_max_time(ctrl))
+		zxpause_update_control(max_time, zxpause_ctrl_c02_enabled(ctrl));
+
+	mutex_unlock(&zxpause_lock);
+
+	return count;
+}
+static DEVICE_ATTR_RW(max_time);
+
+static struct attribute *zxpause_attrs[] = {
+	&dev_attr_enable_c02.attr,
+	&dev_attr_max_time.attr,
+	NULL
+};
+
+static struct attribute_group zxpause_attr_group = {
+	.attrs	= zxpause_attrs,
+	.name	= "zxpause_control",
+};
+
+static int __init zxpause_init(void)
+{
+	struct device *dev;
+	int ret;
+
+	if (!boot_cpu_has(X86_FEATURE_ZXPAUSE))
+		return -ENODEV;
+
+	/*
+	 * Cache the original control MSR value before the control MSR is
+	 * changed. This is the only place where orig_zxpause_control_cached
+	 * is modified.
+	 */
+	rdmsrl(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached);
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "zxpause:online",
+				zxpause_cpu_online, zxpause_cpu_offline);
+	if (ret < 0) {
+		/*
+		 * On failure, the control MSR on all CPUs has the
+		 * original control value.
+		 */
+		return ret;
+	}
+
+	register_syscore_ops(&zxpause_syscore_ops);
+
+	/*
+	 * Add zxpause control interface. Ignore failure, so at least the
+	 * default values are set up in case the machine manages to boot.
+ */
+	dev = cpu_subsys.dev_root;
+	return sysfs_create_group(&dev->kobj, &zxpause_attr_group);
+}
+device_initcall(zxpause_init);
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fddaefc51fb6..6fcc0a1e3c43 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -95,6 +95,9 @@ static __init void x86_late_time_init(void)
 	 */
 	x86_init.irqs.intr_mode_init();
 	tsc_init();
+
+	if (static_cpu_has(X86_FEATURE_ZXPAUSE))
+		use_zxpause_delay();
 }

 /*
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 614c2c6b1959..a5cc05bc22ea 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -85,6 +85,23 @@ static void delay_tsc(unsigned long __loops)
 	preempt_enable();
 }

+/*
+ * On ZHAOXIN the ZXPAUSE instruction waits until any of:
+ * 1) the delta of TSC counter exceeds the value provided in EDX:EAX
+ * 2) global timeout in ZX_PAUSE_CONTROL is exceeded
+ * 3) an external interrupt occurs
+ */
+static void __delay_zxpause(u64 unused, u64 cycles)
+{
+	u64 until = cycles;
+	u32 eax, edx;
+
+	eax = lower_32_bits(until);
+	edx = upper_32_bits(until);
+
+	__zxpause(ZXPAUSE_C01_STATE, edx, eax);
+}
+
 /*
  * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
  * counts with TSC frequency. The input value is the loop of the
@@ -130,6 +147,35 @@ static void delay_mwaitx(unsigned long __loops)
 	}
 }

+/*
+ * Call a vendor specific function to delay for a given amount of time. Because
+ * these functions may return earlier than requested, check for actual elapsed
+ * time and call again until done.
+ */
+static void delay_zxpause(unsigned long __cycles)
+{
+	u64 start, end, cycles = __cycles;
+
+	/*
+	 * A delay of zero cycles requires no wait, so return immediately.
+	 */
+	if (!cycles)
+		return;
+
+	start = rdtsc_ordered();
+
+	for (;;) {
+		__delay_zxpause(start, cycles);
+		end = rdtsc_ordered();
+
+		if (cycles <= end - start)
+			break;
+
+		cycles -= end - start;
+		start = end;
+	}
+}
 /*
  * Since we calibrate only once at boot, this
  * function should be set once at boot and not changed
@@ -142,6 +188,11 @@ void use_tsc_delay(void)
 	delay_fn = delay_tsc;
 }

+void use_zxpause_delay(void)
+{
+	delay_fn = delay_zxpause;
+}
+
 void use_mwaitx_delay(void)
 {
 	delay_fn = delay_mwaitx;
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 968c54f8b7f7..b12c85014bb0 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
 /*
  * Defines x86 CPU feature bits
  */
-#define NCAPINTS	19	/* N 32-bit words worth of info */
+#define NCAPINTS	20	/* N 32-bit words worth of info */
 #define NBUGINTS	2	/* N 32-bit bug flags */

 /*
@@ -348,6 +348,9 @@
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
 #define X86_FEATURE_SPEC_CTRL_SSBD	(18*32+31) /* "" Speculative Store Bypass Disable */

+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 19 */
+#define X86_FEATURE_ZXPAUSE	(19*32 + 0) /* ZHAOXIN ZXPAUSE */
+
 /*
  * BUG word(s)
  */
--
Gitee

From 3a2427b1de69331b935600e7c88d5b5edbb20316 Mon Sep 17 00:00:00 2001
From: leoliu-oc
Date: Tue, 16 Jan 2024 15:18:36 +0800
Subject: [PATCH 2/2] anolis: KVM: x86: Introduce support for Zhaoxin ZXPAUSE instruction

ANBZ: #7918

This patch introduces support for the ZXPAUSE instruction, a new
addition akin to Intel's TPAUSE.

Two primary distinctions set ZXPAUSE apart from TPAUSE:
1. ZXPAUSE utilizes a delta tsc, determined from the lesser value between
   (MSR_ZX_PAUSE_CONTROL[31:2] << 2) and the EDX:EAX input to the ZXPAUSE
   instruction, subtracted from the current tsc value. In contrast, TPAUSE
   employs a target tsc, computed from the lesser value between
   (MSR_IA32_UMWAIT_CONTROL[31:2] << 2) and the EDX:EAX input to the
   TPAUSE instruction.

2. As of now, ZXPAUSE exclusively supports the C0.1 optimization state,
   whereas TPAUSE potentially extends support to both C0.1 and C0.2.

This patch depends on QEMU support for ZXPAUSE, which is currently being
submitted to QEMU, and on the preceding patch in this patchset, which
adds Linux kernel support for ZXPAUSE.

The name "vmx->msr_ia32_umwait_control" is chosen deliberately: patches
for other Linux versions (e.g., 5.5) already carry a
"vmx->msr_ia32_umwait_control" field, so sharing the variable name with
Intel keeps the code compatible. The difference is purely a software
naming choice and causes no real-world conflict.

Currently, if the guest writes to the ZXPAUSE/TPAUSE control MSR, the
value is only cached in the vCPU state and no WRMSR to the hardware MSR
is performed. If the guest uses ZXPAUSE/TPAUSE to transition a vCPU into
an optimized state, it will succeed, with the duration of the optimized
state given by the value passed in EDX:EAX. This state can, of course,
be interrupted by external interrupts and the other events specified in
the specification.

Signed-off-by: leoliu-oc
---
 arch/x86/include/asm/msr-index.h |  7 +++
 arch/x86/include/asm/vmx.h       |  7 +++
 arch/x86/kvm/cpuid.c             | 15 +++++-
 arch/x86/kvm/cpuid.h             |  1 +
 arch/x86/kvm/vmx.c               | 88 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c               |  2 +-
 6 files changed, 117 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c38444f4eed3..41f4381c3461 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -541,6 +541,13 @@
 #define MSR_VIA_RNG	0x0000110b
 #define MSR_VIA_BCR2	0x00001147

+/*
+ * Zhaoxin extended VMCS capabilities:
+ * bit 0: exec-cntl3 VMCS field.
+ */
+#define MSR_ZX_EXT_VMCS_CAPS	0x1675
+#define MSR_ZX_VMCS_EXEC_CTL3	BIT(0)
+
 /* Transmeta defined MSRs */
 #define MSR_TMTA_LONGRUN_CTRL	0x80868010
 #define MSR_TMTA_LONGRUN_FLAGS	0x80868011
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9527ba5d62da..81ea0322bfa6 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -80,6 +80,11 @@
 #define SECONDARY_EXEC_XSAVES		0x00100000
 #define SECONDARY_EXEC_TSC_SCALING	0x02000000

+/*
+ * Definitions of Zhaoxin Tertiary Processor-Based VM-Execution Controls.
+ */
+#define ZX_TERTIARY_EXEC_GUEST_ZXPAUSE	0x00000001
+
 #define PIN_BASED_EXT_INTR_MASK		0x00000001
 #define PIN_BASED_NMI_EXITING		0x00000008
 #define PIN_BASED_VIRTUAL_NMIS		0x00000020
@@ -218,6 +223,7 @@ enum vmcs_field {
 	ENCLS_EXITING_BITMAP_HIGH	= 0x0000202F,
 	TSC_MULTIPLIER			= 0x00002032,
 	TSC_MULTIPLIER_HIGH		= 0x00002033,
+	ZXPAUSE_VMEXIT_TSC		= 0x00002200,
 	GUEST_PHYSICAL_ADDRESS		= 0x00002400,
 	GUEST_PHYSICAL_ADDRESS_HIGH	= 0x00002401,
 	VMCS_LINK_POINTER		= 0x00002800,
@@ -264,6 +270,7 @@ enum vmcs_field {
 	SECONDARY_VM_EXEC_CONTROL	= 0x0000401e,
 	PLE_GAP				= 0x00004020,
 	PLE_WINDOW			= 0x00004022,
+	ZX_TERTIARY_VM_EXEC_CONTROL	= 0x00004200,
 	VM_INSTRUCTION_ERROR		= 0x00004400,
 	VM_EXIT_REASON			= 0x00004402,
 	VM_EXIT_INTR_INFO		= 0x00004404,
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c8c1436bdb68..f4daeccbad08 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -424,6 +424,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
 #endif
 	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
 	unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
+	u32 ign, zx_cap;

 	/* cpuid 1.edx */
 	const u32 kvm_cpuid_1_edx_x86_features =
@@ -733,18 +734,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	/*Add support for Centaur's CPUID instruction*/
 	case 0xC0000000:
-		/*Just support up to 0xC0000004 now*/
-		entry->eax = min(entry->eax, 0xC0000004);
+		/* Extended to 0xC0000006 */
+		entry->eax = min(entry->eax, 0xC0000006);
 		break;
 	case 0xC0000001:
 		entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
 		cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
 		break;
+	case 0xC0000006:
+		rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &zx_cap, &ign);
+		if (zx_cap & MSR_ZX_VMCS_EXEC_CTL3) {
+			/* Now only expose ZXPAUSE(VMCS exec ctl3) */
+			entry->eax &= F(ZXPAUSE);
+			cpuid_mask(&entry->eax, CPUID_C000_0006_EAX);
+		} else
+			entry->eax = 0;
+		break;
 	case 3: /* Processor serial number */
 	case 5: /* MONITOR/MWAIT */
 	case 0xC0000002:
 	case 0xC0000003:
 	case 0xC0000004:
+	case 0xC0000005:
 	default:
 		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index e676bb7ba3f8..75b0f82b19de 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -53,6 +53,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
 	[CPUID_7_ECX]         = { 7, 0, CPUID_ECX},
 	[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
 	[CPUID_7_EDX]         = { 7, 0, CPUID_EDX},
+	[CPUID_C000_0006_EAX] = {0xc0000006, 0, CPUID_EAX},
 };

 static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7e25a72685cb..bd3dd5fd7451 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -195,6 +195,8 @@ static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 static DEFINE_MUTEX(vmx_l1d_flush_mutex);

+static u32 zx_ext_vmcs_cap;
+
 /* Storage for pre module init parameter parsing */
 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
@@ -972,10 +974,12 @@ struct vcpu_vmx {
 #endif

 	u64 spec_ctrl;
+	u32 msr_ia32_umwait_control;

 	u32 vm_entry_controls_shadow;
 	u32 vm_exit_controls_shadow;
 	u32 secondary_exec_control;
+	u32 zx_3rd_exec_control;

 	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
@@ -1329,6 +1333,7 @@ static struct vmcs_config {
 	u32 pin_based_exec_ctrl;
 	u32 cpu_based_exec_ctrl;
 	u32 cpu_based_2nd_exec_ctrl;
+	u32 zx_cpu_based_3rd_exec_ctrl;
 	u32 vmexit_ctrl;
 	u32 vmentry_ctrl;
 	struct nested_vmx_msrs nested;
@@ -4070,6 +4075,11 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 	return 0;
 }

+static inline bool vmx_guest_zxpause_enabled(struct vcpu_vmx *vmx)
+{
+	return vmx->zx_3rd_exec_control & ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -4124,6 +4134,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		msr_info->data = vcpu->arch.mcg_ext_ctl;
 		break;
+	case MSR_ZX_PAUSE_CONTROL:
+		if (!msr_info->host_initiated &&
+		    !vmx_guest_zxpause_enabled(vmx))
+			return 1;
+
+		msr_info->data = vmx->msr_ia32_umwait_control;
+		break;
 	case MSR_IA32_FEAT_CTL:
 		msr_info->data = vmx->msr_ia32_feature_control;
 		break;
@@ -4208,6 +4225,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vmcs_write64(GUEST_BNDCFGS, data);
 		break;
+	case MSR_ZX_PAUSE_CONTROL:
+		if (!msr_info->host_initiated &&
+		    !vmx_guest_zxpause_enabled(vmx))
+			return 1;
+
+		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
+		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
+			return 1;
+
+		vmx->msr_ia32_umwait_control = data;
+		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -4670,6 +4698,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	vmcs_conf->vmexit_ctrl  = _vmexit_control;
 	vmcs_conf->vmentry_ctrl = _vmentry_control;

+	// Setup Zhaoxin exec-cntl3 VMCS field.
+	if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3)
+		vmcs_conf->zx_cpu_based_3rd_exec_ctrl =
+			ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+
 	if (static_branch_unlikely(&enable_evmcs))
 		evmcs_sanitize_exec_ctrls(vmcs_conf);

@@ -6454,6 +6487,7 @@ static bool vmx_rdseed_supported(void)
 		SECONDARY_EXEC_RDSEED_EXITING;
 }

+
 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 {
 	struct kvm_vcpu *vcpu = &vmx->vcpu;
@@ -6589,6 +6623,36 @@ static void ept_set_mmio_spte_mask(void)
 				   VMX_EPT_MISCONFIG_WX_VALUE);
 }

+static inline bool vmx_zxpause_supported(void)
+{
+	return vmcs_config.zx_cpu_based_3rd_exec_ctrl &
+		ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+}
+
+static u32 vmx_zx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+	struct kvm_vcpu *vcpu = &vmx->vcpu;
+	u32 exec_control = vmcs_config.zx_cpu_based_3rd_exec_ctrl;
+
+	/*
+	 * Print an error if QEMU wants to enable guest_zxpause while
+	 * VMX does not support it.
+	 */
+	if (guest_cpuid_has(vcpu, X86_FEATURE_ZXPAUSE)) {
+		if (!vmx_zxpause_supported())
+			pr_err("VMX does not support guest_zxpause!\n");
+		else
+			exec_control |= ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+	} else
+		exec_control &= ~ZX_TERTIARY_EXEC_GUEST_ZXPAUSE;
+
+	/* enable other features here */
+
+	vmx->zx_3rd_exec_control = exec_control;
+	return exec_control;
+}
+
+
 #define VMX_XSS_EXIT_BITMAP 0
 /*
  * Sets up the vmcs for emulated real mode.
@@ -6624,6 +6688,12 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 			vmx->secondary_exec_control);
 	}

+	if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) {
+		vmcs_write32(ZX_TERTIARY_VM_EXEC_CONTROL,
+			     vmx_zx_tertiary_exec_control(vmx));
+		vmcs_write64(ZXPAUSE_VMEXIT_TSC, 0);
+	}
+
 	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 		vmcs_write64(EOI_EXIT_BITMAP1, 0);
@@ -7892,6 +7962,10 @@ static __init int hardware_setup(void)
 {
 	unsigned long host_bndcfgs;
 	int r = -ENOMEM, i;
+	u32 ign;
+
+	// Cache the Zhaoxin extended VMCS capabilities.
+	rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &zx_ext_vmcs_cap, &ign);

 	rdmsrl_safe(MSR_EFER, &host_efer);

@@ -9978,6 +10052,14 @@ static void dump_vmcs(void)
 	if (cpu_has_secondary_exec_ctrls())
 		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

+	pr_err("*** Zhaoxin Specific Fields ***\n");
+	if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) {
+		pr_err("Zhaoxin TertiaryExec Cntl = 0x%016x\n",
+		       vmcs_read32(ZX_TERTIARY_VM_EXEC_CONTROL));
+		pr_err("ZXPAUSE Saved TSC = 0x%016llx\n",
+		       vmcs_read64(ZXPAUSE_VMEXIT_TSC));
+	}
+
 	pr_err("*** Guest State ***\n");
 	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
 	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
@@ -11364,6 +11446,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 		vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
 	}

+	if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) {
+		vmcs_write32(ZX_TERTIARY_VM_EXEC_CONTROL,
+			     vmx_zx_tertiary_exec_control(vmx));
+		vmcs_write64(ZXPAUSE_VMEXIT_TSC, 0);
+	}
+
 	if (nested_vmx_allowed(vcpu))
 		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
 			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 88bb1a405241..820521fad3d5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1057,7 +1057,7 @@ static u32 msrs_to_save[] = {
 #endif
 	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEAT_CTL,
 	MSR_IA32_BNDCFGS, MSR_TSC_AUX,
-	MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
+	MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES, MSR_ZX_PAUSE_CONTROL
 };

 static unsigned num_msrs_to_save;
--
Gitee
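
Usage note for the control interface added by patch 1: the attribute group is
registered with the name "zxpause_control" under the cpu subsystem root, so on
a kernel carrying these patches the knobs are expected to appear under
/sys/devices/system/cpu/zxpause_control/. The commands below are only an
illustrative sketch derived from the code above (attribute names, the sysfs
path and the example value are assumptions, not taken from Zhaoxin
documentation):

    # report whether C0.2 requests are allowed (0 = C0.1 only, 1 = C0.2 allowed)
    cat /sys/devices/system/cpu/zxpause_control/enable_c02

    # read the current system-wide residency cap, in TSC quanta
    cat /sys/devices/system/cpu/zxpause_control/max_time

    # lower the cap to 50000 TSC quanta; bits [1:0] of the value must be zero,
    # otherwise max_time_store() rejects the write with -EINVAL
    echo 50000 > /sys/devices/system/cpu/zxpause_control/max_time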