From 71fb7b4c6d494dd95618ea5d1e86f7bce5768c49 Mon Sep 17 00:00:00 2001 From: leoliu-oc Date: Thu, 5 Feb 2026 11:08:10 +0800 Subject: [PATCH] kvm: x86: fix pauseopt soft lockup in VM-exit zhaoxin inclusion category: feature -------------------- The original PAUSEOPT implementation called kvm_vcpu_read_guest() in is_vmexit_during_pauseopt() to detect PAUSEOPT state on every VM-exit. When multiple vCPUs run across NUMA nodes, this frequent guest memory access can trigger NUMA page migration, causing TLB flush IPI deadlock and soft lockup. Fix by using vmcs_read64(PAUSEOPT_TARGET_TSC) to detect PAUSEOPT state instead of reading guest memory. A non-zero PAUSEOPT_TARGET_TSC value indicates the guest is in PAUSEOPT optimized state. Also move pauseopt state fields from kvm_vcpu_arch to vcpu_vmx to avoid KABI compatibility issues. Changes: - Remove is_vmexit_during_pauseopt() function - Use VMCS field PAUSEOPT_TARGET_TSC for state detection - Move pauseopt_interrupted/pauseopt_rip to vcpu_vmx as private fields (pauseopt_interrupted is renamed to pauseopt_in_progress) - Initialize new fields in vmx_vcpu_reset() Signed-off-by: leoliu-oc --- arch/x86/include/asm/kvm_host.h | 11 -------- arch/x86/include/asm/msr-index.h | 2 +- arch/x86/include/asm/vmx.h | 2 +- arch/x86/kernel/cpu/feat_ctl.c | 2 +- arch/x86/kvm/vmx/capabilities.h | 2 +- arch/x86/kvm/vmx/vmx.c | 45 ++++++++++---------------------- arch/x86/kvm/vmx/vmx.h | 4 ++- 7 files changed, 21 insertions(+), 47 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5bfc4a980939..1f5ae41d6a21 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1030,17 +1030,6 @@ struct kvm_vcpu_arch { #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; #endif - - /* - * Zhaoxin/Centaur extended software managed vcpu states. - * - pauseopt_interrupted: set when pauseopt optimized state interrupted - * by some vmexit. 
- * - pauseopt_rip: stores the guest RIP at the time of vmexit if the vmexit - * occurred during pauseopt optimized state. - * We will move these definitions to zhaoxin specific arch in the future. - */ - bool pauseopt_interrupted; - unsigned long pauseopt_rip; }; struct kvm_lpage_info { diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d72291729999..adde659696af 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -826,7 +826,7 @@ * bit 0: exec-cntl3 VMCS field. */ #define MSR_ZX_EXT_VMCS_CAPS 0x1675 -#define MSR_ZX_VMCS_EXEC_CTL3 BIT(0) +#define MSR_ZX_VMCS_EXEC_CTL3_EN BIT(0) /* Transmeta defined MSRs */ #define MSR_TMTA_LONGRUN_CTRL 0x80868010 diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index edd106370ac9..ca1c6ba062a1 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -291,7 +291,7 @@ enum vmcs_field { PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, NOTIFY_WINDOW = 0x00004024, - ZX_TERTIARY_VM_EXEC_CONTROL = 0x00004200, + ZX_TERTIARY_VM_EXEC_CONTROL = 0x00004200, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index d0071c91d218..ee2aca2e62b5 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -36,7 +36,7 @@ static void init_zhaoxin_ext_capabilities(struct cpuinfo_x86 *c) err = rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &ext_vmcs_cap, &ign); - if (!(ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3)) + if (!(ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3_EN)) return; err = rdmsr_safe(MSR_ZX_VMX_PROCBASED_CTLS3, &ign, &msr_high); diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index e0939a4a2b73..f287396720a9 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -60,8 +60,8 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 
cpu_based_2nd_exec_ctrl; - u32 zx_cpu_based_3rd_exec_ctrl; u64 cpu_based_3rd_exec_ctrl; + u32 zx_cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; u64 misc; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2b2b3e39468e..1f0965677a66 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2664,7 +2664,7 @@ static int setup_zhaoxin_vmcs_controls(struct vmcs_config *vmcs_conf) * control, rather than a bit in the 2nd CPU-based control. */ rdmsr_safe(MSR_ZX_EXT_VMCS_CAPS, &zx_ext_vmcs_cap, &ign); - if (!(zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3)) + if (!(zx_ext_vmcs_cap & MSR_ZX_VMCS_EXEC_CTL3_EN)) return 0; ret = rdmsr_safe(MSR_ZX_VMX_PROCBASED_CTLS3, &ign, &msr_high); @@ -5023,6 +5023,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; + vmx->msr_pauseopt_control = 0; + vmx->pauseopt_in_progress = false; + vmx->pauseopt_rip = 0; vmx->msr_ia32_umwait_control = 0; @@ -7508,36 +7511,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_exit_irqoff(); } -static bool is_vmexit_during_pauseopt(struct kvm_vcpu *vcpu) -{ - uint8_t opcode[4]; - gpa_t gpa; - unsigned long rip; - const u32 pauseopt_opcode = 0xD0A60FF2; - u32 code; - - rip = kvm_rip_read(vcpu); - gpa = kvm_mmu_gva_to_gpa_read(vcpu, (gva_t)rip, NULL); - if (gpa == INVALID_GPA) - return false; - - if (kvm_vcpu_read_guest(vcpu, gpa, opcode, 4) != 0) - return false; - - code = le32_to_cpu(*(u32 *)opcode); - if (code == pauseopt_opcode) - return true; - - return false; -} - static void zx_vmx_vcpu_run_pre(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long new_rip; - if (vcpu->arch.pauseopt_interrupted) { + if (vmx->pauseopt_in_progress) { new_rip = kvm_rip_read(vcpu); - if (new_rip != vcpu->arch.pauseopt_rip) { + if (new_rip != vmx->pauseopt_rip) { /* * When the execution of PAUSEOPT in the guest is interrupted by * other events, causing a vmexit, the 
pauseopt target tsc should be @@ -7545,17 +7526,19 @@ static void zx_vmx_vcpu_run_pre(struct kvm_vcpu *vcpu) * avoiding re-enter pauseopt optimized state after enter guest. */ vmcs_write64(PAUSEOPT_TARGET_TSC, 0); - vcpu->arch.pauseopt_interrupted = false; - vcpu->arch.pauseopt_rip = 0; + vmx->pauseopt_in_progress = false; + vmx->pauseopt_rip = 0; } } } static void zx_vmx_vcpu_run_post(struct kvm_vcpu *vcpu) { - if (cpu_has_vmx_pauseopt() && is_vmexit_during_pauseopt(vcpu)) { - vcpu->arch.pauseopt_interrupted = true; - vcpu->arch.pauseopt_rip = kvm_rip_read(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (cpu_has_vmx_pauseopt() && vmcs_read64(PAUSEOPT_TARGET_TSC)) { + vmx->pauseopt_in_progress = true; + vmx->pauseopt_rip = kvm_rip_read(vcpu); } } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index fc5123313209..e3f4c68b192b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -281,7 +281,9 @@ struct vcpu_vmx { u64 spec_ctrl; u32 msr_ia32_umwait_control; - u32 msr_pauseopt_control; + u32 msr_pauseopt_control; + bool pauseopt_in_progress; + unsigned long pauseopt_rip; /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a -- Gitee