diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4ce54074eea57921aa12b04f8ff052abaed72c6a..8041b559ad2a019a255acf0262ba43d3b07c5242 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,6 +30,7 @@ enum cpuid_leafs CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, + CPUID_C000_0006_EAX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -81,8 +82,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -104,8 +106,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 469fa5c956949a32358021706876babec8aa99b3..7542ff456ede50d7cb55e6a69134b4d0af919e6e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -398,6 +398,9 @@ #define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 19 */ +#define X86_FEATURE_ZXPAUSE (19*32 + 0) /* ZHAOXIN ZXPAUSE */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index de9e7841f953c8910c94a4d08be74f018b646b06..777de5c62bf5deeb3f7bcfdefae6711364c58f11 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -5,6 +5,7 @@ #include void use_tsc_delay(void); +void use_zxpause_delay(void); void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index d2e414cb3ef186ab84eb1bb304f177e15139a0c6..17701df637ea6d82d09f5a9773b854cb3c3f4cb1 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -84,6 +84,7 @@ #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 4353c3b8f93a539781c38f109b673f360a23822d..41f4381c34610b67fa7d87d6be38a4bd22eadc53 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -541,6 +541,13 @@ #define MSR_VIA_RNG 0x0000110b #define MSR_VIA_BCR2 0x00001147 +/* + * Zhaoxin extend VMCS capabilities: + * bit 0: exec-cntl3 VMCS field. + */ +#define MSR_ZX_EXT_VMCS_CAPS 0x1675 +#define MSR_ZX_VMCS_EXEC_CTL3 BIT(0) + /* Transmeta defined MSRs */ #define MSR_TMTA_LONGRUN_CTRL 0x80868010 #define MSR_TMTA_LONGRUN_FLAGS 0x80868011 @@ -880,4 +887,15 @@ #define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 + +#define MSR_ZX_PAUSE_CONTROL 0x187f +#define MSR_ZX_PAUSE_CONTROL_C02_DISABLE BIT(0) +#define MSR_ZX_PAUSE_CONTROL_RESERVED BIT(1) + +/* + * The time field is bit[31:2], but representing a 32bit value with + * bit[1:0] zero. + */ +#define MSR_ZX_PAUSE_CONTROL_TIME_MASK (~0x03U) + #endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 3aa82deeab5ae862111e242138267bb53ddc622a..6f02d93586d07d601d497457d7d0418645c8c9e0 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -23,6 +23,8 @@ #define MWAITX_MAX_LOOPS ((u32)-1) #define MWAITX_DISABLE_CSTATES 0xf0 +#define ZXPAUSE_C01_STATE 1 + static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { @@ -120,4 +122,24 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) current_clr_polling(); } +/* + * Caller can specify whether to enter C0.1 (low latency, less + * power saving) or C0.2 state (saves more power, but longer wakeup + * latency). This may be overridden by the ZX_PAUSE_CONTROL MSR + * which can force requests for C0.2 to be downgraded to C0.1. + */ +static inline void __zxpause(u32 ecx, u32 edx, u32 eax) +{ + /* "zxpause %ecx, %edx, %eax;" */ + #ifdef CONFIG_AS_ZXPAUSE + asm volatile("zxpause %%ecx\n" + : + : "c"(ecx), "d"(edx), "a"(eax)); + #else + asm volatile(".byte 0xf2, 0x0f, 0xa6, 0xd0\t\n" + : + : "c"(ecx), "d"(edx), "a"(eax)); + #endif +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 6847d85400a8b738ca2adf00ea95fdf8f349fafc..fa5700097f6474dcecef8da81b625d30b648ee60 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 9527ba5d62daba4fe5ca96d9c1ba2986f9612ccc..81ea0322bfa656c57d2aacb7cbccd835bbbe499c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -80,6 +80,11 @@ #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 +/* + * Definitions of Zhaoxin Tertiary Processor-Based VM-Execution Controls. + */ +#define ZX_TERTIARY_EXEC_GUEST_ZXPAUSE 0x00000001 + #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 @@ -218,6 +223,7 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + ZXPAUSE_VMEXIT_TSC = 0x00002200, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -264,6 +270,7 @@ enum vmcs_field { SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, + ZX_TERTIARY_VM_EXEC_CONTROL = 0x00004200, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index da019595504e6ca287a49743abf41fb046e7dfce..8b3f3f8f45dd8b26c3884515c1b95b14da7d4265 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -25,6 +25,7 @@ obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-y += umwait.o +obj-y += zxpause.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 169bea16a2e9a2e772a918ac973582ee174798b1..a057be1bfac631cd260180992c3986ea166d7b51 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -135,6 +135,10 @@ static void early_init_centaur(struct cpuinfo_x86 *c) if (edx & (1U << 28)) c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); } + + if (cpuid_eax(0xC0000000) >= 0xC0000006) + c->x86_capability[CPUID_C000_0006_EAX] = cpuid_eax(0xC0000006); + if (detect_extended_topology_early(c) < 0) detect_ht_early(c); } diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index bc792474b55f8e29ec249341f46e1d151f3db782..6b1c24308ea13dee25ade0ce470b66ea0540ceae 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -92,6 +92,9 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c) if (c->x86 == 0x6 || (c->x86 == 0x7 && c->x86_model <= 0x3b)) set_cpu_cap(c, X86_FEATURE_CRC32C_LOW_PERF); + if (cpuid_eax(0xC0000000) >= 0xC0000006) + c->x86_capability[CPUID_C000_0006_EAX] = cpuid_eax(0xC0000006); + if (detect_extended_topology_early(c) < 0) detect_ht_early(c); } diff --git a/arch/x86/kernel/cpu/zxpause.c b/arch/x86/kernel/cpu/zxpause.c new file mode 100644 index 0000000000000000000000000000000000000000..b0d4b5168c89f4f3cadc48b6a5999da7340f9017 --- /dev/null +++ b/arch/x86/kernel/cpu/zxpause.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include + +#define ZXPAUSE_C02_ENABLE 1 + +#define ZXPAUSE_CTRL_VAL(max_time, c02_disable) \ + (((max_time) & MSR_ZX_PAUSE_CONTROL_TIME_MASK) | \ + ((c02_disable) & MSR_ZX_PAUSE_CONTROL_C02_DISABLE)) + +/* + * Cache ZX_PAUSE_CONTROL MSR. This is a systemwide control. By default, + * zxpause max time is 100000 in TSC-quanta and C0.2 is disabled + */ +static u32 zxpause_control_cached = ZXPAUSE_CTRL_VAL(100000, ZXPAUSE_C02_ENABLE); + +/* + * Cache the original ZX_PAUSE_CONTROL MSR value which is configured by + * hardware or BIOS before kernel boot. + */ +static u32 orig_zxpause_control_cached __ro_after_init; + +/* + * Serialize access to zxpause_control_cached and ZX_PAUSE_CONTROL MSR in + * the sysfs write functions. + */ +static DEFINE_MUTEX(zxpause_lock); + +static void zxpause_update_control_msr(void *unused) +{ + lockdep_assert_irqs_disabled(); + wrmsr(MSR_ZX_PAUSE_CONTROL, READ_ONCE(zxpause_control_cached), 0); +} + +/* + * The CPU hotplug callback sets the control MSR to the global control + * value. + * + * Disable interrupts so the read of zxpause_control_cached and the WRMSR + * are protected against a concurrent sysfs write. Otherwise the sysfs + * write could update the cached value after it had been read on this CPU + * and issue the IPI before the old value had been written. The IPI would + * interrupt, write the new value and after return from IPI the previous + * value would be written by this CPU. + * + * With interrupts disabled the upcoming CPU either sees the new control + * value or the IPI is updating this CPU to the new control value after + * interrupts have been reenabled. + */ +static int zxpause_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + zxpause_update_control_msr(NULL); + local_irq_enable(); + return 0; +} + +/* + * The CPU hotplug callback sets the control MSR to the original control + * value. + */ +static int zxpause_cpu_offline(unsigned int cpu) +{ + /* + * This code is protected by the CPU hotplug already and + * orig_zxpause_control_cached is never changed after it caches + * the original control MSR value in zxpause_init(). So there + * is no race condition here. + */ + wrmsr(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached, 0); + + return 0; +} + +/* + * On resume, restore ZX_PAUSE_CONTROL MSR on the boot processor which + * is the only active CPU at this time. The MSR is set up on the APs via the + * CPU hotplug callback. + * + * This function is invoked on resume from suspend and hibernation. On + * resume from suspend the restore should be not required, but we neither + * trust the firmware nor does it matter if the same value is written + * again. + */ +static void zxpause_syscore_resume(void) +{ + zxpause_update_control_msr(NULL); +} + +static struct syscore_ops zxpause_syscore_ops = { + .resume = zxpause_syscore_resume, +}; + +/* sysfs interface */ + +/* + * When bit 0 in ZX_PAUSE_CONTROL MSR is 1, C0.2 is disabled. + * Otherwise, C0.2 is enabled. + */ +static inline bool zxpause_ctrl_c02_enabled(u32 ctrl) +{ + return !(ctrl & MSR_ZX_PAUSE_CONTROL_C02_DISABLE); +} + +static inline u32 zxpause_ctrl_max_time(u32 ctrl) +{ + return ctrl & MSR_ZX_PAUSE_CONTROL_TIME_MASK; +} + +static inline void zxpause_update_control(u32 maxtime, bool c02_enable) +{ + u32 ctrl = maxtime & MSR_ZX_PAUSE_CONTROL_TIME_MASK; + + if (!c02_enable) + ctrl |= MSR_ZX_PAUSE_CONTROL_C02_DISABLE; + + WRITE_ONCE(zxpause_control_cached, ctrl); + /* Propagate to all CPUs */ + on_each_cpu(zxpause_update_control_msr, NULL, 1); +} + +static ssize_t +enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(zxpause_control_cached); + + return sprintf(buf, "%d\n", zxpause_ctrl_c02_enabled(ctrl)); +} + +static ssize_t enable_c02_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool c02_enable; + u32 ctrl; + int ret; + + ret = kstrtobool(buf, &c02_enable); + if (ret) + return ret; + + mutex_lock(&zxpause_lock); + + ctrl = READ_ONCE(zxpause_control_cached); + if (c02_enable != zxpause_ctrl_c02_enabled(ctrl)) + zxpause_update_control(ctrl, c02_enable); + + mutex_unlock(&zxpause_lock); + + return count; +} +static DEVICE_ATTR_RW(enable_c02); + +static ssize_t +max_time_show(struct device *kobj, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(zxpause_control_cached); + + return sprintf(buf, "%u\n", zxpause_ctrl_max_time(ctrl)); +} + +static ssize_t max_time_store(struct device *kobj, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 max_time, ctrl; + int ret; + + ret = kstrtou32(buf, 0, &max_time); + if (ret) + return ret; + + /* bits[1:0] must be zero */ + if (max_time & ~MSR_ZX_PAUSE_CONTROL_TIME_MASK) + return -EINVAL; + + mutex_lock(&zxpause_lock); + + ctrl = READ_ONCE(zxpause_control_cached); + if (max_time != zxpause_ctrl_max_time(ctrl)) + zxpause_update_control(max_time, zxpause_ctrl_c02_enabled(ctrl)); + + mutex_unlock(&zxpause_lock); + + return count; +} +static DEVICE_ATTR_RW(max_time); + +static struct attribute *zxpause_attrs[] = { + &dev_attr_enable_c02.attr, + &dev_attr_max_time.attr, + NULL +}; + +static struct attribute_group zxpause_attr_group = { + .attrs = zxpause_attrs, + .name = "zxpause_control", +}; + +static int __init zxpause_init(void) +{ + struct device *dev; + int ret; + + if (!boot_cpu_has(X86_FEATURE_ZXPAUSE)) + return -ENODEV; + + /* + * Cache the original control MSR value before the control MSR is + * changed. This is the only place where orig_zxpause_control_cached + * is modified. + */ + rdmsrl(MSR_ZX_PAUSE_CONTROL, orig_zxpause_control_cached); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "zxpause:online", + zxpause_cpu_online, zxpause_cpu_offline); + if (ret < 0) { + /* + * On failure, the control MSR on all CPUs has the + * original control value. + */ + return ret; + } + + register_syscore_ops(&zxpause_syscore_ops); + + /* + * Add zxpause control interface. Ignore failure, so at least the + * default values are set up in case the machine manages to boot. + */ + dev = cpu_subsys.dev_root; + return sysfs_create_group(&dev->kobj, &zxpause_attr_group); +} +device_initcall(zxpause_init); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fddaefc51fb6d7ead6630ec8bfedae9bcb94d582..6fcc0a1e3c4330c2e3b8119d3000fe2c882e2e37 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -95,6 +95,9 @@ static __init void x86_late_time_init(void) */ x86_init.irqs.intr_mode_init(); tsc_init(); + + if (static_cpu_has(X86_FEATURE_ZXPAUSE)) + use_zxpause_delay(); } /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c8c1436bdb68c0979dace308f84d5406bd79f1cf..f4daeccbad087068246e750050566adfdd8d3d57 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -424,6 +424,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; + u32 ign, zx_cap; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -733,18 +734,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: - /*Just support up to 0xC0000004 now*/ - entry->eax = min(entry->eax, 0xC0000004); + /* Extended to 0xC0000006 */ + entry->eax = min(entry->eax, 0xC0000006); break; case 0xC0000001: entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); break; + case 0xC0000006: + rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &zx_cap, &ign); + if (zx_cap & MSR_ZX_VMCS_EXEC_CTL3) { + /* Now only expose ZXPAUSE(VMCS exec ctl3) */ + entry->eax &= F(ZXPAUSE); + cpuid_mask(&entry->eax, CPUID_C000_0006_EAX); + } else + entry->eax = 0; + break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 0xC0000002: case 0xC0000003: case 0xC0000004: + case 0xC0000005: default: entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index e676bb7ba3f83531dbfcb9de2d94789819b7cfd7..75b0f82b19de5472a2c2d3774b10effdb23a6c16 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -53,6 +53,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + [CPUID_C000_0006_EAX] = {0xc0000006, 0, CPUID_EAX}, }; static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7e25a72685cbf525cc087ab877b1513a2f808c6d..bd3dd5fd74512b776dda10260953b6b6ffbeb24f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -195,6 +195,8 @@ static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); +static u32 zx_ext_vmcs_cap; + /* Storage for pre module init parameter parsing */ static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; @@ -972,10 +974,12 @@ struct vcpu_vmx { #endif u64 spec_ctrl; + u32 msr_ia32_umwait_control; u32 vm_entry_controls_shadow; u32 vm_exit_controls_shadow; u32 secondary_exec_control; + u32 zx_3rd_exec_control; /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a @@ -1329,6 +1333,7 @@ static struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u32 zx_cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -4070,6 +4075,11 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) return 0; } +static inline bool vmx_guest_zxpause_enabled(struct vcpu_vmx *vmx) +{ + return vmx->zx_3rd_exec_control & ZX_TERTIARY_EXEC_GUEST_ZXPAUSE; +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -4124,6 +4134,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.mcg_ext_ctl; break; + case MSR_ZX_PAUSE_CONTROL: + if (!msr_info->host_initiated && + !vmx_guest_zxpause_enabled(vmx)) + return 1; + + msr_info->data = vmx->msr_ia32_umwait_control; + break; case MSR_IA32_FEAT_CTL: msr_info->data = vmx->msr_ia32_feature_control; break; @@ -4208,6 +4225,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; + case MSR_ZX_PAUSE_CONTROL: + if (!msr_info->host_initiated && + !vmx_guest_zxpause_enabled(vmx)) + return 1; + + /* The reserved bit 1 and non-32 bit [63:32] should be zero */ + if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) + return 1; + + vmx->msr_ia32_umwait_control = data; + break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -4670,6 +4698,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; + // Setup Zhaoxin exec-cntl3 VMCS field. + if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) + vmcs_conf->zx_cpu_based_3rd_exec_ctrl = + ZX_TERTIARY_EXEC_GUEST_ZXPAUSE; + if (static_branch_unlikely(&enable_evmcs)) evmcs_sanitize_exec_ctrls(vmcs_conf); @@ -6454,6 +6487,7 @@ static bool vmx_rdseed_supported(void) SECONDARY_EXEC_RDSEED_EXITING; } + static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; @@ -6589,6 +6623,36 @@ static void ept_set_mmio_spte_mask(void) VMX_EPT_MISCONFIG_WX_VALUE); } +static inline bool vmx_zxpause_supported(void) +{ + return vmcs_config.zx_cpu_based_3rd_exec_ctrl & + ZX_TERTIARY_EXEC_GUEST_ZXPAUSE; +} + +static u32 vmx_zx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + struct kvm_vcpu *vcpu = &vmx->vcpu; + u32 exec_control = vmcs_config.zx_cpu_based_3rd_exec_ctrl; + + /* + * Show errors if Qemu wants to enable guest_zxpause while + * vmx not support it. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_ZXPAUSE)) { + if (!vmx_zxpause_supported()) + pr_err("VMX not support guest_zxpause!\n"); + else + exec_control |= ZX_TERTIARY_EXEC_GUEST_ZXPAUSE; + } else + exec_control &= ~ZX_TERTIARY_EXEC_GUEST_ZXPAUSE; + + /* enable other features here */ + + vmx->zx_3rd_exec_control = exec_control; + return exec_control; +} + + #define VMX_XSS_EXIT_BITMAP 0 /* * Sets up the vmcs for emulated real mode. @@ -6624,6 +6688,12 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->secondary_exec_control); } + if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) { + vmcs_write32(ZX_TERTIARY_VM_EXEC_CONTROL, + vmx_zx_tertiary_exec_control(vmx)); + vmcs_write64(ZXPAUSE_VMEXIT_TSC, 0); + } + if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); @@ -7892,6 +7962,10 @@ static __init int hardware_setup(void) { unsigned long host_bndcfgs; int r = -ENOMEM, i; + u32 ign; + + // Caches Zhaoxin extend VMCS capabilities. + rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &zx_ext_vmcs_cap, &ign); rdmsrl_safe(MSR_EFER, &host_efer); @@ -9978,6 +10052,14 @@ static void dump_vmcs(void) if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + pr_err("*** Zhaoxin Specific Fields ***\n"); + if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) { + pr_err("Zhaoxin TertiaryExec Cntl = 0x%016x\n", + vmcs_read32(ZX_TERTIARY_VM_EXEC_CONTROL)); + pr_err("ZXPAUSE Saved TSC = 0x%016llx\n", + vmcs_read64(ZXPAUSE_VMEXIT_TSC)); + } + pr_err("*** Guest State ***\n"); pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), @@ -11364,6 +11446,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmcs_set_secondary_exec_control(vmx->secondary_exec_control); } + if (zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3) { + vmcs_write32(ZX_TERTIARY_VM_EXEC_CONTROL, + vmx_zx_tertiary_exec_control(vmx)); + vmcs_write64(ZXPAUSE_VMEXIT_TSC, 0); + } + if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88bb1a405241277af785083337a855a63214c0d6..820521fad3d5c3251dec55c52c85537cf830cbc3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1057,7 +1057,7 @@ static u32 msrs_to_save[] = { #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES, MSR_ZX_PAUSE_CONTROL }; static unsigned num_msrs_to_save; diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 614c2c6b195909fc5051f251100433a181b58195..a5cc05bc22eaf6b3c0c30f53b1da03f577e6ece6 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -85,6 +85,23 @@ static void delay_tsc(unsigned long __loops) preempt_enable(); } +/* + * On ZHAOXIN the ZXPAUSE instruction waits until any of: + * 1) the delta of TSC counter exceeds the value provided in EDX:EAX + * 2) global timeout in ZX_PAUSE_CONTROL is exceeded + * 3) an external interrupt occurs + */ +static void __delay_zxpause(u64 unused, u64 cycles) +{ + u64 until = cycles; + u32 eax, edx; + + eax = lower_32_bits(until); + edx = upper_32_bits(until); + + __zxpause(ZXPAUSE_C01_STATE, edx, eax); +} + /* * On some AMD platforms, MWAITX has a configurable 32-bit timer, that * counts with TSC frequency. The input value is the loop of the @@ -130,6 +147,35 @@ static void delay_mwaitx(unsigned long __loops) } } +/* + * Call a vendor specific function to delay for a given amount of time. Because + * these functions may return earlier than requested, check for actual elapsed + * time and call again until done. + */ +static void delay_zxpause(unsigned long __cycles) +{ + u64 start, end, cycles = __cycles; + + /* + * Timer value of 0 causes MWAITX to wait indefinitely, unless there + * is a store on the memory monitored by MONITORX. + */ + if (!cycles) + return; + + start = rdtsc_ordered(); + + for (;;) { + __delay_zxpause(start, cycles); + end = rdtsc_ordered(); + + if (cycles <= end - start) + break; + + cycles -= end - start; + start = end; + } +} /* * Since we calibrate only once at boot, this * function should be set once at boot and not changed @@ -142,6 +188,11 @@ void use_tsc_delay(void) delay_fn = delay_tsc; } +void use_zxpause_delay(void) +{ + delay_fn = delay_zxpause; +} + void use_mwaitx_delay(void) { delay_fn = delay_mwaitx; diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 968c54f8b7f7e7011c334fa21eb9b8284fe7241b..b12c85014bb04aab906d0faa13655d17a6520cd2 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -348,6 +348,9 @@ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000006, word 20 */ +#define X86_FEATURE_ZXPAUSE (19*32 + 0) /* ZHAOXIN ZXPAUSE */ + /* * BUG word(s) */