From a07bf9a75901a7d03ae956018e2f6633302b5387 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Fri, 31 May 2024 04:46:42 +0000 Subject: [PATCH 01/40] KVM: SEV-ES: Prevent MSR access post VMSA encryption mainline inclusion from mainline- commit 27bd5fdc24c0d5d1306f968ef24105c4577242b0 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/27bd5fdc24c0d5d1306f968ef24105c4577242b0 -------------------------------- commit 27bd5fdc24c0d5d1306f968ef24105c4577242b0 upstream KVM currently allows userspace to read/write MSRs even after the VMSA is encrypted. This can cause unintentional issues if MSR access has side- effects. For ex, while migrating a guest, userspace could attempt to migrate MSR_IA32_DEBUGCTLMSR and end up unintentionally disabling LBRV on the target. Fix this by preventing access to those MSRs which are context switched via the VMSA, once the VMSA is encrypted. Suggested-by: Sean Christopherson Signed-off-by: Nikunj A Dadhania Signed-off-by: Ravi Bangoria Message-ID: <20240531044644.768-2-ravi.bangoria@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Malathi --- arch/x86/kvm/svm/svm.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8692b7892d0f..f1e519c59038 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2884,10 +2884,24 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) return 0; } +static bool +sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + return sev_es_guest(vcpu->kvm) && + vcpu->arch.guest_state_protected && + svm_msrpm_offset(msr_info->index) != MSR_INVALID && + !msr_write_intercepted(vcpu, msr_info->index); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); + if (sev_es_prevent_msr_access(vcpu, msr_info)) { + msr_info->data = 0; + return -EINVAL; + } + switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && @@ -3050,6 +3064,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; + + if (sev_es_prevent_msr_access(vcpu, msr)) + return -EINVAL; + switch (ecx) { case MSR_AMD64_TSC_RATIO: -- Gitee From 4548bbb20a629944769fd9541c6684f47b6fa055 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 31 May 2024 04:46:43 +0000 Subject: [PATCH 02/40] KVM: SEV-ES: Disallow SEV-ES guests when X86_FEATURE_LBRV is absent mainline inclusion from mainline- commit d922056215617eedfbdbc29fe49953423686fe5e category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/d922056215617eedfbdbc29fe49953423686fe5e -------------------------------- commit d922056215617eedfbdbc29fe49953423686fe5e upstream As documented in APM[1], LBR Virtualization must be enabled for SEV-ES guests. So, prevent SEV-ES guests when LBRV support is missing. [1]: AMD64 Architecture Programmer's Manual Pub. 40332, Rev. 4.07 - June 2023, Vol 2, 15.35.2 Enabling SEV-ES. 
https://bugzilla.kernel.org/attachment.cgi?id=304653 Fixes: 376c6d285017 ("KVM: SVM: Provide support for SEV-ES vCPU creation/loading") Signed-off-by: Ravi Bangoria Message-ID: <20240531044644.768-3-ravi.bangoria@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Malathi --- arch/x86/kvm/svm/sev.c | 6 ++++++ arch/x86/kvm/svm/svm.c | 16 +++++++--------- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 63611572ae10..e36b7c03efcc 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2405,6 +2405,12 @@ void __init sev_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out; + if (!lbrv) { + WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), + "LBRV must be present for SEV-ES support"); + goto out; + } + if (is_x86_vendor_hygon() && hygon_csv_build >= 1810) { /* * Ths ASIDs from 1 to max_sev_asid are available for hygon diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f1e519c59038..fd14cf85f01a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -219,7 +219,7 @@ int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ -static int lbrv = true; +int lbrv = true; module_param(lbrv, int, 0444); static int tsc_scaling = true; @@ -5410,6 +5410,12 @@ static __init int svm_hardware_setup(void) nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which * may be modified by svm_adjust_mmio_mask()), as well as nrips. @@ -5463,14 +5469,6 @@ static __init int svm_hardware_setup(void) svm_x86_ops.set_vnmi_pending = NULL; } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1108dd91b107..a4065e9e0775 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -41,6 +41,7 @@ extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; extern bool vnmi; +extern int lbrv; /* * Clean bits in VMCB. -- Gitee From 3e9310e623e96ef8f687c51032d7a18cc8738101 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 31 May 2024 04:46:44 +0000 Subject: [PATCH 03/40] KVM: SEV-ES: Delegate LBR virtualization to the processor mainline inclusion from mainline- commit b7e4be0a224fe5c6be30c1c8bdda8d2317ad6ba4 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/b7e4be0a224fe5c6be30c1c8bdda8d2317ad6ba4 -------------------------------- commit b7e4be0a224fe5c6be30c1c8bdda8d2317ad6ba4 upstream As documented in APM[1], LBR Virtualization must be enabled for SEV-ES guests. Although KVM currently enforces LBRV for SEV-ES guests, there are multiple issues with it: o MSR_IA32_DEBUGCTLMSR is still intercepted. Since MSR_IA32_DEBUGCTLMSR interception is used to dynamically toggle LBRV for performance reasons, this can be fatal for SEV-ES guests. For ex SEV-ES guest on Zen3: [guest ~]# wrmsr 0x1d9 0x4 KVM: entry failed, hardware error 0xffffffff EAX=00000004 EBX=00000000 ECX=000001d9 EDX=00000000 Fix this by never intercepting MSR_IA32_DEBUGCTLMSR for SEV-ES guests. No additional save/restore logic is required since MSR_IA32_DEBUGCTLMSR is of swap type A. 
o KVM will disable LBRV if userspace sets MSR_IA32_DEBUGCTLMSR before the VMSA is encrypted. Fix this by moving LBRV enablement code post VMSA encryption. [1]: AMD64 Architecture Programmer's Manual Pub. 40332, Rev. 4.07 - June 2023, Vol 2, 15.35.2 Enabling SEV-ES. https://bugzilla.kernel.org/attachment.cgi?id=304653 Fixes: 376c6d285017 ("KVM: SVM: Provide support for SEV-ES vCPU creation/loading") Co-developed-by: Nikunj A Dadhania Signed-off-by: Nikunj A Dadhania Signed-off-by: Ravi Bangoria Message-ID: <20240531044644.768-4-ravi.bangoria@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Malathi --- arch/x86/kvm/svm/sev.c | 13 ++++++++----- arch/x86/kvm/svm/svm.c | 8 +++++++- arch/x86/kvm/svm/svm.h | 3 ++- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e36b7c03efcc..917fb5db3806 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -785,6 +785,14 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, csv2_sync_reset_vmsa(svm); } + + /* + * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it + * only after setting guest_state_protected because KVM_SET_MSRS allows + * dynamic toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); return 0; } @@ -3220,7 +3228,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; /* * An SEV-ES guest requires a VMSA area that is a separate from the @@ -3272,10 +3279,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Clear intercepts on selected MSRs */ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index fd14cf85f01a..4298065bc092 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -103,6 +103,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_SPEC_CTRL, .always = false }, { .index = MSR_IA32_PRED_CMD, .always = false }, { .index = MSR_IA32_FLUSH_CMD, .always = false }, + { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, @@ -1024,7 +1025,7 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -static void svm_enable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1034,6 +1035,9 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + if (sev_es_guest(vcpu->kvm)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); + /* Move the LBR msrs to the vmcb02 so that the guest can see them. 
*/ if (is_guest_mode(vcpu)) svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); @@ -1043,6 +1047,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a4065e9e0775..91c4b9f34b1c 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -32,7 +32,7 @@ #define GUEST_PAT_WB_ATTR 0x0606060606060606 -#define MAX_DIRECT_ACCESS_MSRS 47 +#define MAX_DIRECT_ACCESS_MSRS 48 #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -551,6 +551,7 @@ u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm); void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); +void svm_enable_lbrv(struct kvm_vcpu *vcpu); void svm_update_lbrv(struct kvm_vcpu *vcpu); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); -- Gitee From 71159eed9fcea42390d7374ce4dbb35a423a1b2c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 20 May 2024 15:45:59 -0700 Subject: [PATCH 04/40] x86/cpu/intel: Switch to new Intel CPU model defines mainline inclusion from mainline- commit 6568fc18c2f62b4f35092e9680fe39f3500f4767 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/6568fc18c2f62b4f35092e9680fe39f3500f4767 -------------------------------- commit 6568fc18c2f62b4f35092e9680fe39f3500f4767 upstream New CPU #defines encode vendor and family as well as model. Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20240520224620.9480-29-tony.luck%40intel.com Signed-off-by: Malathi --- arch/x86/kernel/cpu/intel.c | 108 ++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 55 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f3f4ca82bbb1..d3b42c682cab 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -72,19 +72,19 @@ static bool cpu_model_supports_sld __ro_after_init; */ static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) { - switch (c->x86_model) { - case INTEL_FAM6_CORE_YONAH: - case INTEL_FAM6_CORE2_MEROM: - case INTEL_FAM6_CORE2_MEROM_L: - case INTEL_FAM6_CORE2_PENRYN: - case INTEL_FAM6_CORE2_DUNNINGTON: - case INTEL_FAM6_NEHALEM: - case INTEL_FAM6_NEHALEM_G: - case INTEL_FAM6_NEHALEM_EP: - case INTEL_FAM6_NEHALEM_EX: - case INTEL_FAM6_WESTMERE: - case INTEL_FAM6_WESTMERE_EP: - case INTEL_FAM6_SANDYBRIDGE: + switch (c->x86_vfm) { + case INTEL_CORE_YONAH: + case INTEL_CORE2_MEROM: + case INTEL_CORE2_MEROM_L: + case INTEL_CORE2_PENRYN: + case INTEL_CORE2_DUNNINGTON: + case INTEL_NEHALEM: + case INTEL_NEHALEM_G: + case INTEL_NEHALEM_EP: + case INTEL_NEHALEM_EX: + case INTEL_WESTMERE: + case INTEL_WESTMERE_EP: + case INTEL_SANDYBRIDGE: setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); } } @@ -106,9 +106,9 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) */ if (c->x86 != 6) return; - switch (c->x86_model) { - case INTEL_FAM6_XEON_PHI_KNL: - case INTEL_FAM6_XEON_PHI_KNM: + switch (c->x86_vfm) { + case INTEL_XEON_PHI_KNL: + case INTEL_XEON_PHI_KNM: break; default: return; @@ -134,32 +134,32 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) * - Release note from 20180108 
microcode release */ struct sku_microcode { - u8 model; + u32 vfm; u8 stepping; u32 microcode; }; static const struct sku_microcode spectre_bad_microcodes[] = { - { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 }, - { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE, 0x09, 0x80 }, - { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 }, - { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, - { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, - { INTEL_FAM6_BROADWELL, 0x04, 0x28 }, - { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b }, - { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 }, - { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 }, - { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, - { INTEL_FAM6_HASWELL_L, 0x01, 0x21 }, - { INTEL_FAM6_HASWELL_G, 0x01, 0x18 }, - { INTEL_FAM6_HASWELL, 0x03, 0x23 }, - { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, - { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, - { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + { INTEL_KABYLAKE, 0x0B, 0x80 }, + { INTEL_KABYLAKE, 0x0A, 0x80 }, + { INTEL_KABYLAKE, 0x09, 0x80 }, + { INTEL_KABYLAKE_L, 0x0A, 0x80 }, + { INTEL_KABYLAKE_L, 0x09, 0x80 }, + { INTEL_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_BROADWELL, 0x04, 0x28 }, + { INTEL_BROADWELL_G, 0x01, 0x1b }, + { INTEL_BROADWELL_D, 0x02, 0x14 }, + { INTEL_BROADWELL_D, 0x03, 0x07000011 }, + { INTEL_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_HASWELL_L, 0x01, 0x21 }, + { INTEL_HASWELL_G, 0x01, 0x18 }, + { INTEL_HASWELL, 0x03, 0x23 }, + { INTEL_HASWELL_X, 0x02, 0x3b }, + { INTEL_HASWELL_X, 0x04, 0x10 }, + { INTEL_IVYBRIDGE_X, 0x04, 0x42a }, /* Observed in the wild */ - { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, - { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, + { INTEL_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_SANDYBRIDGE_X, 0x07, 0x712 }, }; static bool bad_spectre_microcode(struct cpuinfo_x86 *c) @@ -173,11 +173,8 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_HYPERVISOR)) return false; - if (c->x86 != 6) - return false; - for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { - if (c->x86_model == spectre_bad_microcodes[i].model && + if (c->x86_vfm == spectre_bad_microcodes[i].vfm && c->x86_stepping == spectre_bad_microcodes[i].stepping) return (c->microcode <= spectre_bad_microcodes[i].microcode); } @@ -312,7 +309,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 && + if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 && c->microcode < 0x20e) { pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); @@ -345,11 +342,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ if (c->x86 == 6) { - switch (c->x86_model) { - case INTEL_FAM6_ATOM_SALTWELL_MID: - case INTEL_FAM6_ATOM_SALTWELL_TABLET: - case INTEL_FAM6_ATOM_SILVERMONT_MID: - case INTEL_FAM6_ATOM_AIRMONT_NP: + switch (c->x86_vfm) { + case INTEL_ATOM_SALTWELL_MID: + case INTEL_ATOM_SALTWELL_TABLET: + case INTEL_ATOM_SILVERMONT_MID: + case INTEL_ATOM_AIRMONT_NP: set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: @@ -393,7 +390,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE * to be modified. 
*/ - if (c->x86 == 5 && c->x86_model == 9) { + if (c->x86_vfm == INTEL_QUARK_X1000) { pr_info("Disabling PGE capability bit\n"); setup_clear_cpu_cap(X86_FEATURE_PGE); } @@ -650,12 +647,13 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && - (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) + if (boot_cpu_has(X86_FEATURE_CLFLUSH) && + (c->x86_vfm == INTEL_CORE2_DUNNINGTON || + c->x86_vfm == INTEL_NEHALEM_EX || + c->x86_vfm == INTEL_WESTMERE_EX)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); - if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) && - ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT))) + if (boot_cpu_has(X86_FEATURE_MWAIT) && c->x86_vfm == INTEL_ATOM_GOLDMONT) set_cpu_bug(c, X86_BUG_MONITOR); #ifdef CONFIG_X86_64 @@ -1290,9 +1288,9 @@ void handle_bus_lock(struct pt_regs *regs) * feature even though they do not enumerate IA32_CORE_CAPABILITIES. */ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_L, 0), + X86_MATCH_VFM(INTEL_ICELAKE_D, 0), {} }; -- Gitee From 733d955bd59a909996af5399200ad0ee06f89331 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 2 Oct 2023 13:30:38 +0200 Subject: [PATCH 05/40] arch/x86: Remove now superfluous sentinel elem from ctl_table arrays mainline inclusion from mainline- commit 83e291d3f5964a6f7a4871db543a2ed31d1b8d07 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/83e291d3f5964a6f7a4871db543a2ed31d1b8d07 -------------------------------- commit 83e291d3f5964a6f7a4871db543a2ed31d1b8d07 upstream This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from sld_sysctl and itmt_kern_table. This removal is safe because register_sysctl_init and register_sysctl implicitly use the array size in addition to checking for the sentinel. 
Reviewed-by: Ingo Molnar Acked-by: Dave Hansen # for x86 Signed-off-by: Joel Granados Signed-off-by: Luis Chamberlain Signed-off-by: Malathi --- arch/x86/kernel/cpu/intel.c | 1 - arch/x86/kernel/itmt.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d3b42c682cab..eb3a731d8e4e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1023,7 +1023,6 @@ static struct ctl_table sld_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sld_mitigate_sysctl_init(void) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 67b980bcbb77..b22c758f41dc 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -74,7 +74,6 @@ static struct ctl_table itmt_kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static struct ctl_table_header *itmt_sysctl_header; -- Gitee From 005f84a38ddada9e2af6954ca99d49f9a6a744b5 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 8 Aug 2024 06:29:34 +0000 Subject: [PATCH 06/40] x86/split_lock: Move Split and Bus lock code to a dedicated file mainline inclusion from mainline- commit 350afa8a1101f62ce31bc4ed6f69cf4b90ec4fa2 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/350afa8a1101f62ce31bc4ed6f69cf4b90ec4fa2 -------------------------------- commit 350afa8a1101f62ce31bc4ed6f69cf4b90ec4fa2 upstream Bus Lock Detect functionality on AMD platforms works identical to Intel. Move split_lock and bus_lock specific code from intel.c to a dedicated file so that it can be compiled and supported on non-Intel platforms. Also, introduce CONFIG_X86_BUS_LOCK_DETECT, make it dependent on CONFIG_CPU_SUP_INTEL and add compilation dependency of the new bus_lock.c file on CONFIG_X86_BUS_LOCK_DETECT. Signed-off-by: Ravi Bangoria Signed-off-by: Thomas Gleixner Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/all/20240808062937.1149-2-ravi.bangoria@amd.com Signed-off-by: Malathi --- arch/x86/Kconfig | 8 + arch/x86/include/asm/cpu.h | 11 +- arch/x86/kernel/cpu/Makefile | 2 + arch/x86/kernel/cpu/bus_lock.c | 406 +++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel.c | 406 --------------------------------- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- 7 files changed, 427 insertions(+), 410 deletions(-) create mode 100644 arch/x86/kernel/cpu/bus_lock.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e8c26aae7057..aa775e4b8d03 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2467,6 +2467,14 @@ config STRICT_SIGALTSTACK_SIZE source "kernel/livepatch/Kconfig" +config X86_BUS_LOCK_DETECT + bool "Split Lock Detect and Bus Lock Detect support" + depends on CPU_SUP_INTEL + default y + help + Enable Split Lock Detect and Bus Lock Detect functionalities. + See for more information. 
+ endmenu config CC_HAS_SLS diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index f8f9a9b79395..43a228e11209 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -34,12 +34,13 @@ int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT extern void __init sld_setup(struct cpuinfo_x86 *c); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); extern void handle_bus_lock(struct pt_regs *regs); -u8 get_this_hybrid_cpu_type(void); +void split_lock_init(void); +void bus_lock_init(void); #else static inline void __init sld_setup(struct cpuinfo_x86 *c) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) @@ -53,7 +54,13 @@ static inline bool handle_guest_split_lock(unsigned long ip) } static inline void handle_bus_lock(struct pt_regs *regs) {} +static inline void split_lock_init(void) {} +static inline void bus_lock_init(void) {} +#endif +#ifdef CONFIG_CPU_SUP_INTEL +u8 get_this_hybrid_cpu_type(void); +#else static inline u8 get_this_hybrid_cpu_type(void) { return 0; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index b96c8413bbc3..97b7f383e58c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -56,6 +56,8 @@ obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_BPF_RVI) += bpf-rvi.o proc.o +obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c new file mode 100644 index 000000000000..704e9241b964 --- /dev/null +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, + sld_ratelimit, +}; + +/* + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. 
+ */ +static bool cpu_model_supports_sld __ro_after_init; + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, +}; + +static struct ratelimit_state bld_ratelimit; + +static unsigned int sysctl_sld_mitigate = 1; +static DEFINE_SEMAPHORE(buslock_sem, 1); + +#ifdef CONFIG_PROC_SYSCTL +static struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } + + return len == arglen; +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init sld_state_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + sld_state = state; +} + +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. + */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +void split_lock_init(void) +{ + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. 
+ */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void __split_lock_reenable_unlock(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static void split_lock_warn(unsigned long ip) +{ + struct delayed_work *work; + int cpu; + + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. + */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + + cpu = get_cpu(); + schedule_delayed_work_on(cpu, work, 2); + + /* Disable split lock detection on this CPU to make progress */ + sld_update_msr(false); + put_cpu(); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +void bus_lock_init(void) +{ + u64 val; + + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. 
*/ + fallthrough; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + +/* + * CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_L, 0), + X86_MATCH_VFM(INTEL_ICELAKE_D, 0), + {} +}; + +static void __init split_lock_setup(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + /* Check for CPUs that have support but do not enumerate it: */ + m = x86_match_cpu(split_lock_cpu_ids); + if (m) + goto supported; + + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + + /* + * Not all bits in MSR_IA32_CORE_CAPS are architectural, but + * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set + * it have split lock detection. + */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + goto supported; + + /* CPU is not in the model list and does not have the MSR bit: */ + return; + +supported: + cpu_model_supports_sld = true; + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: warning on user-space bus_locks\n"); + } + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index eb3a731d8e4e..078871b5f544 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,13 +7,9 @@ #include #include #include -#include #include #include #include -#include -#include -#include #include #include @@ -24,8 +20,6 @@ #include #include #include -#include -#include #include #include #include @@ -41,28 +35,6 @@ #include #endif -enum split_lock_detect_state { - sld_off = 0, - sld_warn, - sld_fatal, - sld_ratelimit, -}; - -/* - * Default to sld_off because most systems do not support split lock detection. - * sld_state_setup() will switch this to sld_warn on systems that support - * split lock/bus lock detect, unless there is a command line override. 
- */ -static enum split_lock_detect_state sld_state __ro_after_init = sld_off; -static u64 msr_test_ctrl_cache __ro_after_init; - -/* - * With a name like MSR_TEST_CTL it should go without saying, but don't touch - * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it - * on CPUs that do not support SLD can cause fireworks, even when writing '0'. - */ -static bool cpu_model_supports_sld __ro_after_init; - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -598,9 +570,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } -static void split_lock_init(void); -static void bus_lock_init(void); - static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -994,381 +963,6 @@ static const struct cpu_dev intel_cpu_dev = { cpu_dev_register(intel_cpu_dev); -#undef pr_fmt -#define pr_fmt(fmt) "x86/split lock detection: " fmt - -static const struct { - const char *option; - enum split_lock_detect_state state; -} sld_options[] __initconst = { - { "off", sld_off }, - { "warn", sld_warn }, - { "fatal", sld_fatal }, - { "ratelimit:", sld_ratelimit }, -}; - -static struct ratelimit_state bld_ratelimit; - -static unsigned int sysctl_sld_mitigate = 1; -static DEFINE_SEMAPHORE(buslock_sem, 1); - -#ifdef CONFIG_PROC_SYSCTL -static struct ctl_table sld_sysctls[] = { - { - .procname = "split_lock_mitigate", - .data = &sysctl_sld_mitigate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; - -static int __init sld_mitigate_sysctl_init(void) -{ - register_sysctl_init("kernel", sld_sysctls); - return 0; -} - -late_initcall(sld_mitigate_sysctl_init); -#endif - -static inline bool match_option(const char *arg, int arglen, const char *opt) -{ - int len = strlen(opt), ratelimit; - - if (strncmp(arg, opt, len)) - return false; - - /* - * Min ratelimit is 1 bus lock/sec. - * Max ratelimit is 1000 bus locks/sec. 
- */ - if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && - ratelimit > 0 && ratelimit <= 1000) { - ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); - ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); - return true; - } - - return len == arglen; -} - -static bool split_lock_verify_msr(bool on) -{ - u64 ctrl, tmp; - - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) - return false; - if (on) - ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - else - ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) - return false; - rdmsrl(MSR_TEST_CTRL, tmp); - return ctrl == tmp; -} - -static void __init sld_state_setup(void) -{ - enum split_lock_detect_state state = sld_warn; - char arg[20]; - int i, ret; - - if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - ret = cmdline_find_option(boot_command_line, "split_lock_detect", - arg, sizeof(arg)); - if (ret >= 0) { - for (i = 0; i < ARRAY_SIZE(sld_options); i++) { - if (match_option(arg, ret, sld_options[i].option)) { - state = sld_options[i].state; - break; - } - } - } - sld_state = state; -} - -static void __init __split_lock_setup(void) -{ - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - if (!split_lock_verify_msr(true)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); -} - -/* - * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking - * is not implemented as one thread could undo the setting of the other - * thread immediately after dropping the lock anyway. - */ -static void sld_update_msr(bool on) -{ - u64 test_ctrl_val = msr_test_ctrl_cache; - - if (on) - test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); -} - -static void split_lock_init(void) -{ - /* - * #DB for bus lock handles ratelimit and #AC for split lock is - * disabled. - */ - if (sld_state == sld_ratelimit) { - split_lock_verify_msr(false); - return; - } - - if (cpu_model_supports_sld) - split_lock_verify_msr(sld_state != sld_off); -} - -static void __split_lock_reenable_unlock(struct work_struct *work) -{ - sld_update_msr(true); - up(&buslock_sem); -} - -static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); - -static void __split_lock_reenable(struct work_struct *work) -{ - sld_update_msr(true); -} -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); - -/* - * If a CPU goes offline with pending delayed work to re-enable split lock - * detection then the delayed work will be executed on some other CPU. That - * handles releasing the buslock_sem, but because it executes on a - * different CPU probably won't re-enable split lock detection. This is a - * problem on HT systems since the sibling CPU on the same core may then be - * left running with split lock detection disabled. - * - * Unconditionally re-enable detection here. 
- */ -static int splitlock_cpu_offline(unsigned int cpu) -{ - sld_update_msr(true); - - return 0; -} - -static void split_lock_warn(unsigned long ip) -{ - struct delayed_work *work; - int cpu; - - if (!current->reported_split_lock) - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); - current->reported_split_lock = 1; - - if (sysctl_sld_mitigate) { - /* - * misery factor #1: - * sleep 10ms before trying to execute split lock. - */ - if (msleep_interruptible(10) > 0) - return; - /* - * Misery factor #2: - * only allow one buslocked disabled core at a time. - */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; - work = &sl_reenable_unlock; - } else { - work = &sl_reenable; - } - - cpu = get_cpu(); - schedule_delayed_work_on(cpu, work, 2); - - /* Disable split lock detection on this CPU to make progress */ - sld_update_msr(false); - put_cpu(); -} - -bool handle_guest_split_lock(unsigned long ip) -{ - if (sld_state == sld_warn) { - split_lock_warn(ip); - return true; - } - - pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", - current->comm, current->pid, - sld_state == sld_fatal ? "fatal" : "bogus", ip); - - current->thread.error_code = 0; - current->thread.trap_nr = X86_TRAP_AC; - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - return false; -} -EXPORT_SYMBOL_GPL(handle_guest_split_lock); - -static void bus_lock_init(void) -{ - u64 val; - - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); - - if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - (sld_state == sld_warn || sld_state == sld_fatal)) || - sld_state == sld_off) { - /* - * Warn and fatal are handled by #AC for split lock if #AC for - * split lock is supported. - */ - val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; - } else { - val |= DEBUGCTLMSR_BUS_LOCK_DETECT; - } - - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); -} - -bool handle_user_split_lock(struct pt_regs *regs, long error_code) -{ - if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) - return false; - split_lock_warn(regs->ip); - return true; -} - -void handle_bus_lock(struct pt_regs *regs) -{ - switch (sld_state) { - case sld_off: - break; - case sld_ratelimit: - /* Enforce no more than bld_ratelimit bus locks/sec. */ - while (!__ratelimit(&bld_ratelimit)) - msleep(20); - /* Warn on the bus lock. */ - fallthrough; - case sld_warn: - pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", - current->comm, current->pid, regs->ip); - break; - case sld_fatal: - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - break; - } -} - -/* - * CPU models that are known to have the per-core split-lock detection - * feature even though they do not enumerate IA32_CORE_CAPABILITIES. - */ -static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - X86_MATCH_VFM(INTEL_ICELAKE_X, 0), - X86_MATCH_VFM(INTEL_ICELAKE_L, 0), - X86_MATCH_VFM(INTEL_ICELAKE_D, 0), - {} -}; - -static void __init split_lock_setup(struct cpuinfo_x86 *c) -{ - const struct x86_cpu_id *m; - u64 ia32_core_caps; - - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return; - - /* Check for CPUs that have support but do not enumerate it: */ - m = x86_match_cpu(split_lock_cpu_ids); - if (m) - goto supported; - - if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) - return; - - /* - * Not all bits in MSR_IA32_CORE_CAPS are architectural, but - * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set - * it have split lock detection. 
- */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) - goto supported; - - /* CPU is not in the model list and does not have the MSR bit: */ - return; - -supported: - cpu_model_supports_sld = true; - __split_lock_setup(); -} - -static void sld_state_show(void) -{ - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) - return; - - switch (sld_state) { - case sld_off: - pr_info("disabled\n"); - break; - case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, - "x86/splitlock", NULL, splitlock_cpu_offline) < 0) - pr_warn("No splitlock CPU offline handler\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: warning on user-space bus_locks\n"); - } - break; - case sld_fatal: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", - boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? - " from non-WB" : ""); - } - break; - case sld_ratelimit: - if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); - break; - } -} - -void __init sld_setup(struct cpuinfo_x86 *c) -{ - split_lock_setup(c); - sld_state_setup(); - sld_state_show(); -} - #define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 /** diff --git a/include/linux/sched.h b/include/linux/sched.h index cf394b636b79..ca6847b82f72 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -992,7 +992,7 @@ struct task_struct { #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT diff --git a/kernel/fork.c b/kernel/fork.c index 78663ca68160..a145026b4009 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1262,7 +1262,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->active_memcg = NULL; #endif -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT tsk->reported_split_lock = 0; #endif -- Gitee From fd81d8356cde007d29edfd0efb435fe72b4ddc6e Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 8 Aug 2024 06:29:35 +0000 Subject: [PATCH 07/40] x86/bus_lock: Add support for AMD mainline inclusion from mainline- commit 408eb7417a92c5354c7be34f7425b305dfe30ad9 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/408eb7417a92c5354c7be34f7425b305dfe30ad9 -------------------------------- commit 408eb7417a92c5354c7be34f7425b305dfe30ad9 upstream Add Bus Lock Detect (called Bus Lock Trap in AMD docs) support for AMD platforms. Bus Lock Detect is enumerated with CPUID Fn0000_0007_ECX_x0 bit [24 / BUSLOCKTRAP]. It can be enabled through MSR_IA32_DEBUGCTLMSR. When enabled, hardware clears DR6[11] and raises a #DB exception on occurrence of Bus Lock if CPL > 0. More detail about the feature can be found in AMD APM[1]. [1]: AMD64 Architecture Programmer's Manual Pub. 40332, Rev. 
4.07 - June 2023, Vol 2, 13.1.3.6 Bus Lock Trap https://bugzilla.kernel.org/attachment.cgi?id=304653 Signed-off-by: Ravi Bangoria Signed-off-by: Thomas Gleixner Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/all/20240808062937.1149-3-ravi.bangoria@amd.com Signed-off-by: PvsNarasimha --- Documentation/arch/x86/buslock.rst | 3 ++- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/cpu/intel.c | 1 - 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/arch/x86/buslock.rst b/Documentation/arch/x86/buslock.rst index 4c5a4822eeb7..31f1bfdff16f 100644 --- a/Documentation/arch/x86/buslock.rst +++ b/Documentation/arch/x86/buslock.rst @@ -26,7 +26,8 @@ Detection ========= Intel processors may support either or both of the following hardware -mechanisms to detect split locks and bus locks. +mechanisms to detect split locks and bus locks. Some AMD processors also +support bus lock detect. #AC exception for split lock detection -------------------------------------- diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index aa775e4b8d03..dc713811092f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2469,7 +2469,7 @@ source "kernel/livepatch/Kconfig" config X86_BUS_LOCK_DETECT bool "Split Lock Detect and Bus Lock Detect support" - depends on CPU_SUP_INTEL + depends on CPU_SUP_INTEL || CPU_SUP_AMD default y help Enable Split Lock Detect and Bus Lock Detect functionalities. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f337ff292ca3..0318eb422d21 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1945,6 +1945,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) * reinit x86_vfm in case anything changed. */ init_cpu_x86_vfm(c); + bus_lock_init(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 078871b5f544..92a4d9e88b07 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -679,7 +679,6 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_misc_features(c); split_lock_init(); - bus_lock_init(); intel_init_thermal(c); } -- Gitee From 03b55dac449175897a7cf1e13ba3c1f166a806ac Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 25 Apr 2024 16:07:53 +0800 Subject: [PATCH 08/40] cpufreq: amd-pstate: Unify computation of {max,min,nominal,lowest_nonlinear}_freq mainline inclusion from mainline- commit 5547c0ebfc2efdab6ee93a7fd4d9c411ad87013e category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/5547c0ebfc2efdab6ee93a7fd4d9c411ad87013e -------------------------------- commit 5547c0ebfc2efdab6ee93a7fd4d9c411ad87013e upstream Currently the amd_get_{min, max, nominal, lowest_nonlinear}_freq() helpers computes the values of min_freq, max_freq, nominal_freq and lowest_nominal_freq respectively afresh from cppc_get_perf_caps(). This is not necessary as there are fields in cpudata to cache these values. To simplify this, add a single helper function named amd_pstate_init_freq() which computes all these frequencies at once, and caches it in cpudata. Use the cached values everywhere else in the code. Acked-by: Huang Rui Reviewed-by: Li Meng Tested-by: Dhananjay Ugwekar Co-developed-by: Gautham R. Shenoy Signed-off-by: Gautham R. Shenoy Signed-off-by: Perry Yuan Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Arukonda Rahul --- drivers/cpufreq/amd-pstate.c | 126 ++++++++++++++++------------------- 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 62575dda58c1..2ddffe822628 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -621,74 +621,22 @@ static void amd_pstate_adjust_perf(unsigned int cpu, static int amd_get_min_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; - - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - - /* Switch to khz */ - return cppc_perf.lowest_freq * 1000; + return READ_ONCE(cpudata->min_freq); } static int amd_get_max_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; - u32 max_perf, max_freq, nominal_freq, nominal_perf; - u64 boost_ratio; - - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - - nominal_freq = cppc_perf.nominal_freq; - nominal_perf = READ_ONCE(cpudata->nominal_perf); - max_perf = READ_ONCE(cpudata->highest_perf); - - boost_ratio = div_u64(max_perf << SCHED_CAPACITY_SHIFT, - nominal_perf); - - max_freq = nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT; - - /* Switch to khz */ - return max_freq * 1000; + return READ_ONCE(cpudata->max_freq); } static int amd_get_nominal_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; - - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - - /* Switch to khz */ - return cppc_perf.nominal_freq * 1000; + return READ_ONCE(cpudata->nominal_freq); } static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) { - struct cppc_perf_caps cppc_perf; - u32 lowest_nonlinear_freq, lowest_nonlinear_perf, - nominal_freq, nominal_perf; - u64 lowest_nonlinear_ratio; - - int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; - - nominal_freq = cppc_perf.nominal_freq; - nominal_perf = READ_ONCE(cpudata->nominal_perf); - - lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf; - - lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, - nominal_perf); - - lowest_nonlinear_freq = nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT; - - /* Switch to khz */ - return lowest_nonlinear_freq * 1000; + return READ_ONCE(cpudata->lowest_nonlinear_freq); } static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) @@ -809,6 +757,53 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) schedule_work(&sched_prefcore_work); } +/** + * amd_pstate_init_freq: Initialize the max_freq, min_freq, + * nominal_freq and lowest_nonlinear_freq for + * the @cpudata object. + * + * Requires: highest_perf, lowest_perf, nominal_perf and + * lowest_nonlinear_perf members of @cpudata to be + * initialized. + * + * Returns 0 on success, non-zero value on failure. 
+ */ +static int amd_pstate_init_freq(struct amd_cpudata *cpudata) +{ + int ret; + u32 min_freq; + u32 highest_perf, max_freq; + u32 nominal_perf, nominal_freq; + u32 lowest_nonlinear_perf, lowest_nonlinear_freq; + u32 boost_ratio, lowest_nonlinear_ratio; + struct cppc_perf_caps cppc_perf; + + + ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + + min_freq = cppc_perf.lowest_freq * 1000; + nominal_freq = cppc_perf.nominal_freq; + nominal_perf = READ_ONCE(cpudata->nominal_perf); + + highest_perf = READ_ONCE(cpudata->highest_perf); + boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); + max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); + lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, + nominal_perf); + lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + + WRITE_ONCE(cpudata->min_freq, min_freq); + WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); + WRITE_ONCE(cpudata->nominal_freq, nominal_freq); + WRITE_ONCE(cpudata->max_freq, max_freq); + + return 0; +} + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; @@ -836,6 +831,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; + ret = amd_pstate_init_freq(cpudata); + if (ret) + goto free_cpudata1; + min_freq = amd_get_min_freq(cpudata); max_freq = amd_get_max_freq(cpudata); nominal_freq = amd_get_nominal_freq(cpudata); @@ -877,13 +876,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) goto free_cpudata2; } - /* Initial processor data capability frequencies */ - cpudata->max_freq = max_freq; - cpudata->min_freq = min_freq; cpudata->max_limit_freq = max_freq; cpudata->min_limit_freq = min_freq; - cpudata->nominal_freq = nominal_freq; - cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; policy->driver_data = cpudata; @@ -1294,6 +1288,10 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; + ret = amd_pstate_init_freq(cpudata); + if (ret) + goto free_cpudata1; + min_freq = amd_get_min_freq(cpudata); max_freq = amd_get_max_freq(cpudata); nominal_freq = amd_get_nominal_freq(cpudata); @@ -1310,12 +1308,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; - /* Initial processor data capability frequencies */ - cpudata->max_freq = max_freq; - cpudata->min_freq = min_freq; - cpudata->nominal_freq = nominal_freq; - cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; - policy->driver_data = cpudata; cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); -- Gitee From 1225842c224f186e46675c0b4c88ebcc756b8fd1 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 25 Apr 2024 16:07:58 +0800 Subject: [PATCH 09/40] cpufreq: amd-pstate: Add quirk for the pstate CPPC capabilities missing mainline inclusion from mainline- commit eb8b6c36820214df96e7e86d8614d93f6b028f28 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/eb8b6c36820214df96e7e86d8614d93f6b028f28 -------------------------------- commit eb8b6c36820214df96e7e86d8614d93f6b028f28 upstream Add quirks table to get CPPC capabilities issue fixed by providing correct perf or frequency values while driver loading. 
If the CPPC capabilities are not defined in the ACPI tables, or are wrongly defined by the platform firmware, a quirk is needed to supply correct workaround values so that the amd-pstate driver can still be loaded despite the broken CPPC capabilities. The workaround matches broken BIOSes that lack the nominal_freq and lowest_freq CPPC capability definitions in the ACPI table. $ cat /sys/devices/system/cpu/cpu0/acpi_cppc/lowest_freq 0 $ cat /sys/devices/system/cpu/cpu0/acpi_cppc/nominal_freq 0 Acked-by: Huang Rui Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Tested-by: Dhananjay Ugwekar Signed-off-by: Perry Yuan Signed-off-by: Rafael J. Wysocki Signed-off-by: Arukonda Rahul --- drivers/cpufreq/amd-pstate.c | 53 ++++++++++++++++++++++++++++++++++-- include/linux/amd-pstate.h | 6 ++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2ddffe822628..d86567ea06f7 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -68,6 +68,7 @@ static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; static bool cppc_enabled; static bool amd_pstate_prefcore = true; +static struct quirk_entry *quirks; /* * AMD Energy Preference Performance (EPP) @@ -112,6 +113,41 @@ static unsigned int epp_values[] = { typedef int (*cppc_mode_transition_fn)(int); +static struct quirk_entry quirk_amd_7k62 = { + .nominal_freq = 2600, + .lowest_freq = 550, +}; + +static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) +{ + /** + * match the broken bios for family 17h processor support CPPC V2 + * broken BIOS lack of nominal_freq and lowest_freq capabilities + * definition in ACPI tables + */ + if (boot_cpu_has(X86_FEATURE_ZEN2)) { + quirks = dmi->driver_data; + pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident); + return 1; + } + + return 0; +} + +static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = { + { + .callback = dmi_matched_7k62_bios_bug, + .ident = "AMD EPYC 7K62", + .matches = { + DMI_MATCH(DMI_BIOS_VERSION, "5.14"), + DMI_MATCH(DMI_BIOS_RELEASE, "12/12/2019"), + }, + .driver_data = &quirk_amd_7k62, + }, + {} +}; +MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table); + static inline int get_mode_idx_from_str(const char *str, size_t size) { int i; @@ -783,8 +819,16 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) if (ret) return ret; - min_freq = cppc_perf.lowest_freq * 1000; - nominal_freq = cppc_perf.nominal_freq; + if (quirks && quirks->lowest_freq) + min_freq = quirks->lowest_freq * 1000; + else + min_freq = cppc_perf.lowest_freq * 1000; + + if (quirks && quirks->nominal_freq) + nominal_freq = quirks->nominal_freq ; + else + nominal_freq = cppc_perf.nominal_freq; + nominal_perf = READ_ONCE(cpudata->nominal_perf); highest_perf = READ_ONCE(cpudata->highest_perf); @@ -1624,6 +1668,11 @@ static int __init amd_pstate_init(void) if (cpufreq_get_current_driver()) return -EEXIST; + quirks = NULL; + + /* check if this machine need CPPC quirks */ + dmi_check_system(amd_pstate_quirks_table); + switch (cppc_state) { case AMD_PSTATE_UNDEFINED: /* Disable on the following configs by default: diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 68fc1bd8d851..e28c675a5b73 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -118,4 +118,10 @@ static const char * const amd_pstate_mode_string[] = { [AMD_PSTATE_GUIDED] = "guided", NULL, }; +
+struct quirk_entry { + u32 nominal_freq; + u32 lowest_freq; +}; + #endif /* _LINUX_AMD_PSTATE_H */ -- Gitee From c179bff3b4ed2d76cb64dca3b8b92d5c94d2714c Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Wed, 19 Jun 2024 23:40:17 +0800 Subject: [PATCH 10/40] cpufreq: amd-pstate: switch boot_cpu_has() to cpu_feature_enabled() mainline inclusion from mainline- commit c9fdaba8369ea6ca01728ed9c25c6ca1b90d3355 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/c9fdaba8369ea6ca01728ed9c25c6ca1b90d3355 -------------------------------- commit c9fdaba8369ea6ca01728ed9c25c6ca1b90d3355 upstream replace the usage of the deprecated boot_cpu_has() function with the modern cpu_feature_enabled() function. The switch to cpu_feature_enabled() ensures compatibility with the latest CPU feature detection mechanisms and improves code maintainability. Acked-by: Mario Limonciello Suggested-by: Borislav Petkov (AMD) Signed-off-by: Perry Yuan Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/f1567593ac5e1d38343067e9c681a8c4b0707038.1718811234.git.perry.yuan@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Arukonda Rahul --- drivers/cpufreq/amd-pstate.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index d86567ea06f7..4590b05f4bf7 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -125,7 +125,7 @@ static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) * broken BIOS lack of nominal_freq and lowest_freq capabilities * definition in ACPI tables */ - if (boot_cpu_has(X86_FEATURE_ZEN2)) { + if (cpu_feature_enabled(X86_FEATURE_ZEN2)) { quirks = dmi->driver_data; pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident); return 1; @@ -167,7 +167,7 @@ static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) u64 epp; int ret; - if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { if (!cppc_req_cached) { epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req_cached); @@ -240,7 +240,7 @@ static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) int ret; struct cppc_perf_ctrls perf_ctrls; - if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { u64 value = READ_ONCE(cpudata->cppc_req_cached); value &= ~GENMASK_ULL(31, 24); @@ -740,7 +740,7 @@ static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) { int ret; - if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { u64 cap1; ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); @@ -903,7 +903,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; - if (boot_cpu_has(X86_FEATURE_CPPC)) + if (cpu_feature_enabled(X86_FEATURE_CPPC)) policy->fast_switch_possible = true; ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], @@ -1135,7 +1135,7 @@ static int amd_pstate_change_mode_without_dvr_change(int mode) cppc_state = mode; - if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) + if (cpu_feature_enabled(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) return 0; for_each_present_cpu(cpu) { @@ -1369,7 +1369,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) else policy->policy = CPUFREQ_POLICY_POWERSAVE; - if (boot_cpu_has(X86_FEATURE_CPPC)) { + if 
(cpu_feature_enabled(X86_FEATURE_CPPC)) { ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); if (ret) return ret; @@ -1453,7 +1453,7 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) epp = 0; /* Set initial EPP value */ - if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { value &= ~GENMASK_ULL(31, 24); value |= (u64)epp << 24; } @@ -1492,7 +1492,7 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) value = READ_ONCE(cpudata->cppc_req_cached); max_perf = READ_ONCE(cpudata->highest_perf); - if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); } else { perf_ctrls.max_perf = max_perf; @@ -1526,7 +1526,7 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy) value = READ_ONCE(cpudata->cppc_req_cached); mutex_lock(&amd_pstate_limits_lock); - if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; /* Set max perf same as min perf */ @@ -1682,7 +1682,7 @@ static int __init amd_pstate_init(void) */ if (amd_pstate_acpi_pm_profile_undefined() || amd_pstate_acpi_pm_profile_server() || - !boot_cpu_has(X86_FEATURE_CPPC)) { + !cpu_feature_enabled(X86_FEATURE_CPPC)) { pr_info("driver load is disabled, boot with specific mode to enable this\n"); return -ENODEV; } @@ -1701,7 +1701,7 @@ static int __init amd_pstate_init(void) } /* capability check */ - if (boot_cpu_has(X86_FEATURE_CPPC)) { + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { pr_debug("AMD CPPC MSR based functionality is supported\n"); if (cppc_state != AMD_PSTATE_ACTIVE) current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; -- Gitee From 80343960172f43cd57a6e23c3840a5a4165fe60e Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Wed, 19 Jun 2024 23:40:18 +0800 Subject: [PATCH 11/40] cpufreq: amd-pstate: enable shared memory type CPPC by default mainline inclusion from mainline- commit 918263938c41dac46935609477672442e8fa5d9e category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/918263938c41dac46935609477672442e8fa5d9e -------------------------------- commit 918263938c41dac46935609477672442e8fa5d9e upstream The amd-pstate-epp driver has been implemented and resolves the performance drop issue seen in passive mode for shared memory type CPPC systems. Users who enable the active mode driver will not experience a performance drop compared to the passive mode driver. Therefore, the EPP driver should be loaded by default for shared memory type CPPC system to get better performance. Signed-off-by: Perry Yuan Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/c705507cf3ee790e544251cfd897ed11e8e57712.1718811234.git.perry.yuan@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Arukonda Rahul --- drivers/cpufreq/amd-pstate.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 4590b05f4bf7..2ad5f1750b0c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -53,15 +53,6 @@ #define CPPC_HIGHEST_PERF_PERFORMANCE 196 #define CPPC_HIGHEST_PERF_DEFAULT 166 -/* - * TODO: We need more time to fine tune processors with shared memory solution - * with community together. - * - * There are some performance drops on the CPU benchmarks which reports from - * Suse. We are co-working with them to fine tune the shared memory solution. 
So - * we disable it by default to go acpi-cpufreq on these processors and add a - * module parameter to be able to enable it manually for debugging. - */ static struct cpufreq_driver *current_pstate_driver; static struct cpufreq_driver amd_pstate_driver; static struct cpufreq_driver amd_pstate_epp_driver; @@ -1678,11 +1669,9 @@ static int __init amd_pstate_init(void) /* Disable on the following configs by default: * 1. Undefined platforms * 2. Server platforms - * 3. Shared memory designs */ if (amd_pstate_acpi_pm_profile_undefined() || - amd_pstate_acpi_pm_profile_server() || - !cpu_feature_enabled(X86_FEATURE_CPPC)) { + amd_pstate_acpi_pm_profile_server()) { pr_info("driver load is disabled, boot with specific mode to enable this\n"); return -ENODEV; } -- Gitee From 206a2fcb75e2b178e0d1725d6cc43b1fc14f0031 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Mon, 21 Oct 2024 15:48:36 +0530 Subject: [PATCH 12/40] amd-pstate: Switch to amd-pstate by default on some Server platforms mainline inclusion from mainline- commit 54ab7d7c59c9444a28b67eaa10983f86ebf73c69 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/54ab7d7c59c9444a28b67eaa10983f86ebf73c69 -------------------------------- commit 54ab7d7c59c9444a28b67eaa10983f86ebf73c69 upstream Currently the default cpufreq driver for all the AMD EPYC servers is acpi-cpufreq. Going forward, switch to amd-pstate as the default driver on the AMD EPYC server platforms with CPU family 0x1A or higher. The default mode will be active mode. Testing shows that amd-pstate with active mode and performance governor provides comparable or better performance per-watt against acpi-cpufreq + performance governor. Likewise, amd-pstate with active mode and powersave governor with the energy_performance_preference=power (EPP=255) provides comparable or better performance per-watt against acpi-cpufreq + schedutil governor for a wide range of workloads. Users can still revert to using acpi-cpufreq driver on these platforms with the "amd_pstate=disable" kernel commandline parameter. Signed-off-by: Swapnil Sapkal Signed-off-by: Gautham R. Shenoy Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20241021101836.9047-3-gautham.shenoy@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Arukonda Rahul --- drivers/cpufreq/amd-pstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2ad5f1750b0c..1a5c542fbd23 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1668,10 +1668,10 @@ static int __init amd_pstate_init(void) case AMD_PSTATE_UNDEFINED: /* Disable on the following configs by default: * 1. Undefined platforms - * 2. Server platforms + * 2. Server platforms with CPUs older than Family 0x1A. 
*/ if (amd_pstate_acpi_pm_profile_undefined() || - amd_pstate_acpi_pm_profile_server()) { + (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) { pr_info("driver load is disabled, boot with specific mode to enable this\n"); return -ENODEV; } -- Gitee From 8667c8c688e46abc9a68b9efe9869ddeb1daacbb Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:30 +0000 Subject: [PATCH 13/40] perf/amd/ibs: Remove IBS_{FETCH|OP}_CONFIG_MASK macros mainline inclusion from mainline- commit 003c0414318a1829a1a5b195ad81e8a7960c3f5d category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/003c0414318a1829a1a5b195ad81e8a7960c3f5d -------------------------------- commit 003c0414318a1829a1a5b195ad81e8a7960c3f5d upstream Definition of these macros are very simple and they are used at only one place. Get rid of unnecessary redirection. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-2-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul --- arch/x86/events/amd/ibs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c3a2f6f57770..063e8f83bc57 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -28,9 +28,6 @@ static u32 ibs_caps; #include #include -#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) -#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT - /* * IBS states: @@ -670,7 +667,7 @@ static struct perf_ibs perf_ibs_fetch = { .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSFETCHCTL, - .config_mask = IBS_FETCH_CONFIG_MASK, + .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN, .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, @@ -694,7 +691,7 @@ static struct perf_ibs perf_ibs_op = { .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSOPCTL, - .config_mask = IBS_OP_CONFIG_MASK, + .config_mask = IBS_OP_MAX_CNT, .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, -- Gitee From 3af2201e63aaf8b1924f53b2ff43bd5f25baed79 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:31 +0000 Subject: [PATCH 14/40] perf/amd/ibs: Remove pointless sample period check mainline inclusion from mainline- commit 88c7bcad71c83f52f24108dedcecae0d18dbc627 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/88c7bcad71c83f52f24108dedcecae0d18dbc627 -------------------------------- commit 88c7bcad71c83f52f24108dedcecae0d18dbc627 upstream Valid perf event sample period value for IBS PMUs (Fetch and Op both) is limited to multiple of 0x10. perf_ibs_init() has this check: if (!event->attr.sample_freq && hwc->sample_period & 0x0f) return -EINVAL; But it's broken since hwc->sample_period will always be 0 when event->attr.sample_freq is 0 (irrespective of event->attr.freq value.) One option to fix this is to change the condition: - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + if (!event->attr.freq && hwc->sample_period & 0x0f) However, that will break all userspace tools which have been using IBS event with sample_period not multiple of 0x10. Another option is to remove the condition altogether and mask lower nibble _silently_, same as what current code is inadvertently doing. I'm preferring this approach as it keeps the existing behavior. 
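For illustration only (not part of the patch), a minimal userspace sketch of the rounding behaviour kept here; the ~0xF mask and the 0x10 fallback mirror perf_ibs_init(), everything else is made up for the demo:

  #include <stdio.h>
  #include <stdint.h>

  /* Drop the lower nibble silently and fall back to the hardware
   * minimum of 0x10 if the requested period rounds down to zero. */
  static uint64_t ibs_adjust_period(uint64_t requested)
  {
          uint64_t period = requested & ~0x0FULL;

          return period ? period : 0x10;
  }

  int main(void)
  {
          /* 0x1007 -> 0x1000, 0xc -> 0x10 */
          printf("%#llx\n", (unsigned long long)ibs_adjust_period(0x1007));
          printf("%#llx\n", (unsigned long long)ibs_adjust_period(0xc));
          return 0;
  }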
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-3-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul --- arch/x86/events/amd/ibs.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 063e8f83bc57..67b9d619a908 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -295,13 +295,8 @@ static int perf_ibs_init(struct perf_event *event) if (config & perf_ibs->cnt_mask) /* raw max_cnt may not be set */ return -EINVAL; - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) - /* - * lower 4 bits can not be set in ibs max cnt, - * but allowing it in case we adjust the - * sample period to set a frequency. - */ - return -EINVAL; + + /* Silently mask off lower nibble. IBS hw mandates it. */ hwc->sample_period &= ~0x0FULL; if (!hwc->sample_period) hwc->sample_period = 0x10; -- Gitee From c79f4192717b7000a075d621c93dc9c5fc293ea0 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:32 +0000 Subject: [PATCH 15/40] perf/amd/ibs: Fix ->config to sample period calculation for OP PMU mainline inclusion from mainline- commit 598bdf4fefff5af4ce6d26d16f7b2a20808fc4cb category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/598bdf4fefff5af4ce6d26d16f7b2a20808fc4cb -------------------------------- commit 598bdf4fefff5af4ce6d26d16f7b2a20808fc4cb upstream Instead of using standard perf_event_attr->freq=0 and ->sample_period fields, IBS event in 'sample period mode' can also be opened by setting period value directly in perf_event_attr->config in a MaxCnt bit-field format. IBS OP MaxCnt bits are defined as: (high bits) IbsOpCtl[26:20] = IbsOpMaxCnt[26:20] (low bits) IbsOpCtl[15:0] = IbsOpMaxCnt[19:4] Perf event sample period can be derived from MaxCnt bits as: sample_period = (high bits) | ((low_bits) << 4); However, current code just masks MaxCnt bits and shifts all of them, including high bits, which is incorrect. Fix it. 
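As a rough worked example of the MaxCnt decode described above, a standalone sketch with the two masks written out literally (0xFFFF for IbsOpCtl[15:0] and 0x7F<<20 for IbsOpCtl[26:20], matching IBS_OP_MAX_CNT and IBS_OP_MAX_CNT_EXT_MASK in the patch); illustrative, not the kernel code:

  #include <stdio.h>
  #include <stdint.h>

  #define OP_MAX_CNT_LO  0x000000000000FFFFULL   /* IbsOpCtl[15:0]  = IbsOpMaxCnt[19:4]  */
  #define OP_MAX_CNT_HI  0x0000000007F00000ULL   /* IbsOpCtl[26:20] = IbsOpMaxCnt[26:20] */

  /* period = (high bits, already in place) | (low bits << 4) */
  static uint64_t ibs_op_period(uint64_t config, int opcntext)
  {
          uint64_t period = (config & OP_MAX_CNT_LO) << 4;

          if (opcntext)
                  period |= config & OP_MAX_CNT_HI;

          return period;
  }

  int main(void)
  {
          /* low bits 0x1234 and high bits 0x3 encode a period of 0x312340 */
          uint64_t config = (0x3ULL << 20) | 0x1234ULL;

          printf("period = %#llx\n", (unsigned long long)ibs_op_period(config, 1));
          return 0;
  }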
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-4-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul --- arch/x86/events/amd/ibs.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 67b9d619a908..bd12b891ad43 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -269,7 +269,7 @@ static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; - u64 max_cnt, config; + u64 config; int ret; perf_ibs = get_ibs_pmu(event->attr.type); @@ -301,10 +301,19 @@ static int perf_ibs_init(struct perf_event *event) if (!hwc->sample_period) hwc->sample_period = 0x10; } else { - max_cnt = config & perf_ibs->cnt_mask; + u64 period = 0; + + if (perf_ibs == &perf_ibs_op) { + period = (config & IBS_OP_MAX_CNT) << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + period |= config & IBS_OP_MAX_CNT_EXT_MASK; + } else { + period = (config & IBS_FETCH_MAX_CNT) << 4; + } + config &= ~perf_ibs->cnt_mask; - event->attr.sample_period = max_cnt << 4; - hwc->sample_period = event->attr.sample_period; + event->attr.sample_period = period; + hwc->sample_period = period; } if (!hwc->sample_period) -- Gitee From cfb17be261db82617848fa4bfbd6decce7ca1cb9 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:33 +0000 Subject: [PATCH 16/40] perf/amd/ibs: Fix perf_ibs_op.cnt_mask for CurCnt mainline inclusion from mainline- commit 46dcf85566170d4528b842bf83ffc350d71771fa category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/46dcf85566170d4528b842bf83ffc350d71771fa -------------------------------- commit 46dcf85566170d4528b842bf83ffc350d71771fa upstream IBS Op uses two counters: MaxCnt and CurCnt. MaxCnt is programmed with the desired sample period. IBS hw generates sample when CurCnt reaches to MaxCnt. The size of these counter used to be 20 bits but later they were extended to 27 bits. The 7 bit extension is indicated by CPUID Fn8000_001B_EAX[6 / OpCntExt]. perf_ibs->cnt_mask variable contains bit masks for MaxCnt and CurCnt. But IBS driver does not set upper 7 bits of CurCnt in cnt_mask even when OpCntExt CPUID bit is set. Fix this. IBS driver uses cnt_mask[CurCnt] bits only while disabling an event. Fortunately, CurCnt bits are not read from MSR while re-enabling the event, instead MaxCnt is programmed with desired period and CurCnt is set to 0. Hence, we did not see any issues so far. 
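A quick standalone check of the 27-bit CurCnt coverage mentioned above, built from the mask values added and already present in the perf_event.h hunk of this patch (illustrative only):

  #include <stdio.h>
  #include <stdint.h>

  /* Mask values as defined in arch/x86/include/asm/perf_event.h. */
  #define IBS_OP_CUR_CNT          (0xFFF80ULL << 32)
  #define IBS_OP_CUR_CNT_RAND     (0x0007FULL << 32)
  #define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL << 52)

  int main(void)
  {
          uint64_t curcnt = IBS_OP_CUR_CNT | IBS_OP_CUR_CNT_RAND | IBS_OP_CUR_CNT_EXT_MASK;

          /* With the EXT mask included, CurCnt covers the full 27-bit
           * field at IbsOpCtl[58:32]: prints 0x7ffffff. */
          printf("CurCnt field >> 32 = %#llx\n", (unsigned long long)(curcnt >> 32));
          return 0;
  }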
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-5-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul --- arch/x86/events/amd/ibs.c | 3 ++- arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index bd12b891ad43..bf45239b4c52 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1223,7 +1223,8 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; - perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK | + IBS_OP_CUR_CNT_EXT_MASK); } if (ibs_caps & IBS_CAPS_ZEN4) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 066721067f8a..af5ea333bc58 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -498,6 +498,7 @@ struct pebs_xmm { */ #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) +#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) -- Gitee From 0f6cd60a40832beddc7151ca22e756b62390a6ab Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:34 +0000 Subject: [PATCH 17/40] perf/amd/ibs: Don't allow freq mode event creation through ->config interface mainline inclusion from mainline- commit e1e7844ced88f9558a48579390a7d4eaac6a28eb category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/e1e7844ced88f9558a48579390a7d4eaac6a28eb -------------------------------- commit e1e7844ced88f9558a48579390a7d4eaac6a28eb upstream Most perf_event_attr->config bits directly maps to IBS_{FETCH|OP}_CTL MSR. Since the sample period is programmed in these control registers, IBS PMU driver allows opening an IBS event by setting sample period value directly in perf_event_attr->config instead of using explicit perf_event_attr->sample_period interface. However, this logic is not applicable for freq mode events since the semantics of control register fields are applicable only to fixed sample period whereas the freq mode event adjusts sample period after each and every sample. Currently, IBS driver (unintentionally) allows creating freq mode event via ->config interface, which is semantically wrong as well as detrimental because it can be misused to bypass perf_event_max_sample_rate checks. Don't allow freq mode event creation through perf_event_attr->config interface. 
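For illustration, a hypothetical perf_event_open() call showing the combination this patch rejects: a period encoded in attr.config together with freq mode. The config value is an arbitrary example; with the fix the open is expected to fail with EINVAL:

  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/perf_event.h>

  int main(void)
  {
          struct perf_event_attr attr;
          unsigned int type;
          FILE *f = fopen("/sys/bus/event_source/devices/ibs_op/type", "r");
          int fd;

          if (!f || fscanf(f, "%u", &type) != 1)
                  return 1;
          fclose(f);

          memset(&attr, 0, sizeof(attr));
          attr.size = sizeof(attr);
          attr.type = type;
          attr.config = 0x1000 >> 4;   /* sample period encoded in MaxCnt bits... */
          attr.freq = 1;               /* ...but freq mode requested as well */
          attr.sample_freq = 1000;

          fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
          if (fd < 0)
                  perror("perf_event_open rejected, as expected");
          else
                  close(fd);
          return 0;
  }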
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-6-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty --- arch/x86/events/amd/ibs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index bf45239b4c52..401db361acfa 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -303,6 +303,9 @@ static int perf_ibs_init(struct perf_event *event) } else { u64 period = 0; + if (event->attr.freq) + return -EINVAL; + if (perf_ibs == &perf_ibs_op) { period = (config & IBS_OP_MAX_CNT) << 4; if (ibs_caps & IBS_CAPS_OPCNTEXT) -- Gitee From 083a65c2715e1c51cc1bfda4153125d14b9bc7b5 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:35 +0000 Subject: [PATCH 18/40] perf/amd/ibs: Add PMU specific minimum period mainline inclusion from mainline- commit b2fc7b282bf7c1253b01c8da84e894539a3e709d category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/b2fc7b282bf7c1253b01c8da84e894539a3e709d -------------------------------- commit b2fc7b282bf7c1253b01c8da84e894539a3e709d upstream 0x10 is the minimum sample period for IBS Fetch and 0x90 for IBS Op. Current IBS PMU driver uses 0x10 for both the PMUs, which is incorrect. Fix it by adding PMU specific minimum period values in struct perf_ibs. Also, bail out opening a 'sample period mode' event if the user requested sample period is less than PMU supported minimum value. For a 'freq mode' event, start calibrating sample period from PMU specific minimum period. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-7-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty --- arch/x86/events/amd/ibs.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 401db361acfa..4eb84d3e8124 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -84,6 +84,7 @@ struct perf_ibs { u64 cnt_mask; u64 enable_mask; u64 valid_mask; + u16 min_period; u64 max_period; unsigned long offset_mask[1]; int offset_max; @@ -296,10 +297,14 @@ static int perf_ibs_init(struct perf_event *event) /* raw max_cnt may not be set */ return -EINVAL; - /* Silently mask off lower nibble. IBS hw mandates it. */ - hwc->sample_period &= ~0x0FULL; - if (!hwc->sample_period) - hwc->sample_period = 0x10; + if (event->attr.freq) { + hwc->sample_period = perf_ibs->min_period; + } else { + /* Silently mask off lower nibble. IBS hw mandates it. 
*/ + hwc->sample_period &= ~0x0FULL; + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; + } } else { u64 period = 0; @@ -317,10 +322,10 @@ static int perf_ibs_init(struct perf_event *event) config &= ~perf_ibs->cnt_mask; event->attr.sample_period = period; hwc->sample_period = period; - } - if (!hwc->sample_period) - return -EINVAL; + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; + } /* * If we modify hwc->sample_period, we also need to update @@ -341,7 +346,8 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, int overflow; /* ignore lower 4 bits in min count: */ - overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + overflow = perf_event_set_period(hwc, perf_ibs->min_period, + perf_ibs->max_period, period); local64_set(&hwc->prev_count, 0); return overflow; @@ -678,6 +684,7 @@ static struct perf_ibs perf_ibs_fetch = { .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, + .min_period = 0x10, .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, @@ -703,6 +710,7 @@ static struct perf_ibs perf_ibs_op = { IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, + .min_period = 0x90, .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, -- Gitee From b5490e4847da5a7983c9f2e6768c3af192b67a91 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:36 +0000 Subject: [PATCH 19/40] perf/amd/ibs: Add ->check_period() callback mainline inclusion from mainline- commit 1afbdd970f50f2e0431fae26b25d4e54e561fa7f category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/1afbdd970f50f2e0431fae26b25d4e54e561fa7f -------------------------------- commit 1afbdd970f50f2e0431fae26b25d4e54e561fa7f upstream IBS Fetch and IBS Op PMUs have constraints on sample period. The sample period is verified at the time of opening an event but not at the ioctl() interface. Hence, a user can open an event with valid period but change it later with ioctl(). Add a ->check_period() callback to verify the period provided at ioctl() is also valid. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-8-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty --- arch/x86/events/amd/ibs.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4eb84d3e8124..6839cceb96f2 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -552,6 +552,28 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +static int perf_ibs_check_period(struct perf_event *event, u64 value) +{ + struct perf_ibs *perf_ibs; + u64 low_nibble; + + if (event->attr.freq) + return 0; + + perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + low_nibble = value & 0xFULL; + + /* + * This contradicts with perf_ibs_init() which allows sample period + * with lower nibble bits set but silently masks them off. Whereas + * this returns error. + */ + if (low_nibble || value < perf_ibs->min_period) + return -EINVAL; + + return 0; +} + /* * We need to initialize with empty group if all attributes in the * group are dynamic. 
@@ -678,6 +700,7 @@ static struct perf_ibs perf_ibs_fetch = { .stop = perf_ibs_stop, .read = perf_ibs_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSFETCHCTL, .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN, @@ -703,6 +726,7 @@ static struct perf_ibs perf_ibs_op = { .stop = perf_ibs_stop, .read = perf_ibs_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_MAX_CNT, -- Gitee From c4207307f5ba032d34a60f3dff51a8ca097826b3 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:37 +0000 Subject: [PATCH 20/40] perf/amd/ibs: Ceil sample_period to min_period mainline inclusion from mainline- commit fa5d0a824e3bbd1f793d962f9e012ab0a8ee11c5 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/fa5d0a824e3bbd1f793d962f9e012ab0a8ee11c5 -------------------------------- commit fa5d0a824e3bbd1f793d962f9e012ab0a8ee11c5 upstream The sample_period needs to be recalibrated after every sample to match the desired sampling freq for a 'freq mode event'. Since the next sample_period is calculated by generic kernel, PMU specific constraints are not (explicitly) reckoned. The sample_period value is programmed in a MaxCnt field of IBS PMUs, and the MaxCnt field has following constraints: 1) MaxCnt must be multiple of 0x10. Kernel keeps track of residual / over-counted period into period_left, which should take care of this constraint by programming MaxCnt with (sample_period & ~0xF) and adding remaining period into the next sample. 2) MaxCnt must be >= 0x10 for IBS Fetch PMU and >= 0x90 for IBS Op PMU. Currently, IBS PMU driver allows sample_period below min_period, which is an undefined HW behavior. Reset sample_period to min_period whenever it's less than that. 
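A trivial sketch of the clamp being added, with the minimum values taken from the earlier patch in this series (0x10 for IBS Fetch, 0x90 for IBS Op); illustrative only:

  #include <stdio.h>
  #include <stdint.h>

  /* Freq-mode periods recalculated by the core below the PMU minimum
   * are raised back to that minimum before being programmed. */
  static uint64_t ibs_clamp_period(uint64_t period, uint64_t min_period)
  {
          return period < min_period ? min_period : period;
  }

  int main(void)
  {
          printf("op:    %#llx\n", (unsigned long long)ibs_clamp_period(0x40, 0x90)); /* -> 0x90 */
          printf("fetch: %#llx\n", (unsigned long long)ibs_clamp_period(0x40, 0x10)); /* -> 0x40 */
          return 0;
  }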
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250115054438.1021-9-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty --- arch/x86/events/amd/ibs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 6839cceb96f2..38215fd06a46 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -445,6 +445,9 @@ static void perf_ibs_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + perf_ibs_set_period(perf_ibs, hwc, &period); if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { config |= period & IBS_OP_MAX_CNT_EXT_MASK; @@ -1169,6 +1172,10 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_sample_save_callchain(&data, event, iregs); throttle = perf_event_overflow(event, &data, ®s); + + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + out: if (throttle) { perf_ibs_stop(event, 0); -- Gitee From 1075f26659f7d9b4a28c80a7a7faee29adac5058 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Feb 2025 06:05:41 +0000 Subject: [PATCH 21/40] perf/amd/ibs: Add support for OP Load Latency Filtering mainline inclusion from mainline- commit d20610c19b4a22bc69085b7eb7a02741d51de30e category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/d20610c19b4a22bc69085b7eb7a02741d51de30e -------------------------------- commit d20610c19b4a22bc69085b7eb7a02741d51de30e upstream IBS Op PMU on Zen5 uarch added new Load Latency filtering capability. It's advertised by CPUID_Fn8000001B_EAX bit 12. When enabled, IBS HW will raise interrupt only for sample that had an IbsDcMissLat value greater than N cycles, where N is a programmable value defined as multiples of 128 (i.e. 128, 256, 384 etc.) from 128-2048 cycles. Similar to L3MissOnly, IBS HW internally drops the sample and restarts if the sample does not meet the filtering criteria. Add support for LdLat filtering in IBS Op PMU. Since hardware supports threshold in multiple of 128, add a software filter on top to support latency threshold with the granularity of 1 cycle between [128-2048]. 
Example usage: # perf record -a -e ibs_op/ldlat=128/ -- sleep 5 Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250205060547.1337-2-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty --- arch/x86/events/amd/ibs.c | 93 ++++++++++++++++++++++++++++--- arch/x86/include/asm/amd-ibs.h | 3 +- arch/x86/include/asm/perf_event.h | 3 + 3 files changed, 90 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 38215fd06a46..0aacedecf08a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -266,6 +266,14 @@ static int validate_group(struct perf_event *event) return 0; } +static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + return perf_ibs == &perf_ibs_op && + (ibs_caps & IBS_CAPS_OPLDLAT) && + (event->attr.config1 & 0xFFF); +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -327,6 +335,17 @@ static int perf_ibs_init(struct perf_event *event) return -EINVAL; } + if (perf_ibs_ldlat_event(perf_ibs, event)) { + u64 ldlat = event->attr.config1 & 0xFFF; + + if (ldlat < 128 || ldlat > 2048) + return -EINVAL; + ldlat >>= 7; + + config |= (ldlat - 1) << 59; + config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN; + } + /* * If we modify hwc->sample_period, we also need to update * hwc->last_period and hwc->period_left. @@ -605,7 +624,9 @@ PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -613,6 +634,12 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; } +static umode_t +ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPLDLAT ? 
attr->mode : 0; +} + static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, @@ -628,6 +655,11 @@ static struct attribute *zen4_ibs_extensions_attrs[] = { NULL, }; +static struct attribute *ibs_op_ldlat_cap_attrs[] = { + &ibs_op_ldlat_cap.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, @@ -645,6 +677,12 @@ static struct attribute_group group_zen4_ibs_extensions = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_ibs_op_ldlat_cap = { + .name = "caps", + .attrs = ibs_op_ldlat_cap_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, &empty_caps_group, @@ -673,6 +711,11 @@ static struct attribute *op_l3missonly_attrs[] = { NULL, }; +static struct attribute *ibs_op_ldlat_format_attrs[] = { + &ibs_op_ldlat_format.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -685,10 +728,18 @@ static struct attribute_group group_op_l3missonly = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_ibs_op_ldlat_format = { + .name = "format", + .attrs = ibs_op_ldlat_format_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, &group_zen4_ibs_extensions, + &group_ibs_op_ldlat_cap, + &group_ibs_op_ldlat_format, NULL, }; @@ -1043,15 +1094,25 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, } } -static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, +static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + return perf_ibs == &perf_ibs_op && + sample_type & (PERF_SAMPLE_DATA_SRC | + PERF_SAMPLE_WEIGHT_TYPE | + PERF_SAMPLE_ADDR | + PERF_SAMPLE_PHYS_ADDR); +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, + struct perf_event *event, int check_rip) { - if (sample_type & PERF_SAMPLE_RAW || - (perf_ibs == &perf_ibs_op && - (sample_type & PERF_SAMPLE_DATA_SRC || - sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR || - sample_type & PERF_SAMPLE_PHYS_ADDR))) + if (event->attr.sample_type & PERF_SAMPLE_RAW || + perf_ibs_is_mem_sample_type(perf_ibs, event) || + perf_ibs_ldlat_event(perf_ibs, event)) return perf_ibs->offset_max; else if (check_rip) return 3; @@ -1106,7 +1167,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip); do { rdmsrl(msr + offset, *buf++); @@ -1115,6 +1176,22 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + + if (perf_ibs_ldlat_event(perf_ibs, event)) { + union ibs_op_data3 op_data3; + + op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + /* + * Opening event is errored out if load latency threshold is + * outside of [128, 2048] range. Since the event has reached + * interrupt handler, we can safely assume the threshold is + * within [128, 2048] range. 
+ */ + if (!op_data3.ld_op || !op_data3.dc_miss || + op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF)) + goto out; + } + /* * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately * depending on their availability. diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index cb2a5e113daa..77f3a589a99a 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index af5ea333bc58..1f1c7e154913 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -471,6 +471,7 @@ struct pebs_xmm { #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) +#define IBS_CAPS_OPLDLAT (1U<<12) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -496,6 +497,8 @@ struct pebs_xmm { * The lower 7 bits of the current count are random bits * preloaded by hardware and ignored in software */ +#define IBS_OP_LDLAT_EN (1ULL<<63) +#define IBS_OP_LDLAT_THRSH (0xFULL<<59) #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) #define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) -- Gitee From 5e7f50e175b65427668e129f194796ae95e8b064 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Feb 2025 06:05:42 +0000 Subject: [PATCH 22/40] perf/amd/ibs: Update DTLB/PageSize decode logic mainline inclusion from mainline- commit 0b347a4218da08b1eb400c259d193bff463dae87 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/0b347a4218da08b1eb400c259d193bff463dae87 -------------------------------- commit 0b347a4218da08b1eb400c259d193bff463dae87 upstream IBS Op PMU on Zen5 reports DTLB and page size information differently compared to prior generation. The change is enumerated by CPUID_Fn8000001B_EAX[19]. IBS_OP_DATA3 Zen3/4 Zen5 ---------------------------------------------------------------- 19 IbsDcL2TlbHit1G Reserved ---------------------------------------------------------------- 6 IbsDcL2tlbHit2M Reserved ---------------------------------------------------------------- 5 IbsDcL1TlbHit1G PageSize: 4 IbsDcL1TlbHit2M 0 - 4K 1 - 2M 2 - 1G 3 - Reserved Valid only if IbsDcPhyAddrValid = 1 ---------------------------------------------------------------- 3 IbsDcL2TlbMiss IbsDcL2TlbMiss Valid only if IbsDcPhyAddrValid = 1 ---------------------------------------------------------------- 2 IbsDcL1tlbMiss IbsDcL1tlbMiss Valid only if IbsDcPhyAddrValid = 1 ---------------------------------------------------------------- o Currently, only bit 2 and 3 are interpreted by IBS NMI handler for PERF_SAMPLE_DATA_SRC. Add dependency on IbsDcPhyAddrValid for those bits. o Introduce new IBS Op PMU capability and expose it to userspace via PMU's sysfs directory. 
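A rough standalone decoder for the Zen5 layout in the table above. The TLB and PageSize bit positions come from that table; the IbsDcPhyAddrValid position (bit 18) follows the existing ibs_op_data3 layout. Illustrative only, not driver code:

  #include <stdio.h>
  #include <stdint.h>

  static void decode_zen5_dtlb(uint64_t op_data3)
  {
          static const char * const pgsz[] = { "4K", "2M", "1G", "reserved" };

          /* On Zen5 the TLB and page-size bits are only meaningful when
           * IbsDcPhyAddrValid (bit 18) is set. */
          if (!((op_data3 >> 18) & 1)) {
                  printf("DTLB/page-size bits not valid\n");
                  return;
          }

          printf("L1 TLB miss=%d L2 TLB miss=%d page size=%s\n",
                 (int)((op_data3 >> 2) & 1),
                 (int)((op_data3 >> 3) & 1),
                 pgsz[(op_data3 >> 4) & 3]);
  }

  int main(void)
  {
          /* PhyAddrValid + L1 TLB miss + PageSize=1 (2M) */
          decode_zen5_dtlb((1ULL << 18) | (1ULL << 4) | (1ULL << 2));
          return 0;
  }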
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250205060547.1337-3-ravi.bangoria@amd.com Signed-off-by: Malathi --- arch/x86/events/amd/ibs.c | 23 +++++++++++++++++++++++ arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 24 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 0aacedecf08a..64d7b125d0f6 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -627,6 +627,7 @@ PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1"); +PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -640,6 +641,12 @@ ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i) return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0; } +static umode_t +ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0; +} + static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, @@ -660,6 +667,11 @@ static struct attribute *ibs_op_ldlat_cap_attrs[] = { NULL, }; +static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = { + &ibs_op_dtlb_pgsize_cap.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, @@ -683,6 +695,12 @@ static struct attribute_group group_ibs_op_ldlat_cap = { .is_visible = ibs_op_ldlat_is_visible, }; +static struct attribute_group group_ibs_op_dtlb_pgsize_cap = { + .name = "caps", + .attrs = ibs_op_dtlb_pgsize_cap_attrs, + .is_visible = ibs_op_dtlb_pgsize_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, &empty_caps_group, @@ -740,6 +758,7 @@ static const struct attribute_group *op_attr_update[] = { &group_zen4_ibs_extensions, &group_ibs_op_ldlat_cap, &group_ibs_op_ldlat_format, + &group_ibs_op_dtlb_pgsize_cap, NULL, }; @@ -990,6 +1009,10 @@ static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, if (!op_data3->dc_lin_addr_valid) return; + if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) && + !op_data3->dc_phy_addr_valid) + return; + if (!op_data3->dc_l1tlb_miss) { data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; return; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 1f1c7e154913..0f9f8f31734c 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -472,6 +472,7 @@ struct pebs_xmm { #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_OPLDLAT (1U<<12) +#define IBS_CAPS_OPDTLBPGSIZE (1U<<19) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ -- Gitee From 394d0fbaab97b16088dce0545294e1930a3e1edc Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Feb 2025 06:05:43 +0000 Subject: [PATCH 23/40] perf amd ibs: Sync arch/x86/include/asm/amd-ibs.h header with the kernel mainline inclusion from mainline- commit 3201bfa368fee5e70927e45222ff0b235352c01c category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/3201bfa368fee5e70927e45222ff0b235352c01c -------------------------------- commit 3201bfa368fee5e70927e45222ff0b235352c01c upstream Sync load latency related bit fields into the tool's header copy 
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250205060547.1337-4-ravi.bangoria@amd.com Signed-off-by: Malathi --- tools/arch/x86/include/asm/amd-ibs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 93807b437e4d..cb1740bc3da2 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; -- Gitee From f1999b4674bce1f483c9a4cc0017b8cf8ee86ef6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:30 -0700 Subject: [PATCH 24/40] x86/bugs: Rename entry_ibpb() to write_ibpb() mainline inclusion from mainline- commit 13235d6d50bba99931c4392c0f813cfae0de3eac category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/13235d6d50bba99931c4392c0f813cfae0de3eac -------------------------------- commit 13235d6d50bba99931c4392c0f813cfae0de3eac upstream There's nothing entry-specific about entry_ibpb(). In preparation for calling it from elsewhere, rename it to write_ibpb(). Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/1e54ace131e79b760de3fe828264e26d0896e3ac.1744148254.git.jpoimboe@kernel.org Signed-off-by: chaithanyaLagisetty --- arch/x86/entry/entry.S | 7 ++++--- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/bugs.c | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 2143358d0c4c..e0ebd95434f7 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -14,7 +14,8 @@ .pushsection .noinstr.text, "ax" -SYM_FUNC_START(entry_ibpb) +/* Clobbers AX, CX, DX */ +SYM_FUNC_START(write_ibpb) movl $MSR_IA32_PRED_CMD, %ecx movl $PRED_CMD_IBPB, %eax xorl %edx, %edx @@ -23,9 +24,9 @@ SYM_FUNC_START(entry_ibpb) /* Make sure IBPB clears return stack preductions too. */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET -SYM_FUNC_END(entry_ibpb) +SYM_FUNC_END(write_ibpb) /* For KVM */ -EXPORT_SYMBOL_GPL(entry_ibpb); +EXPORT_SYMBOL_GPL(write_ibpb); .popsection diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ee642d26e304..8226a48a8ded 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -292,7 +292,7 @@ * typically has NO_MELTDOWN). * * While retbleed_untrain_ret() doesn't clobber anything but requires stack, - * entry_ibpb() will clobber AX, CX, DX. + * write_ibpb() will clobber AX, CX, DX. * * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. 
@@ -302,7 +302,7 @@ VALIDATE_UNRET_END CALL_UNTRAIN_RET ALTERNATIVE_2 "", \ - "call entry_ibpb", \ibpb_feature, \ + "call write_ibpb", \ibpb_feature, \ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm @@ -401,7 +401,7 @@ extern void srso_untrain_ret(void); extern void srso_alias_untrain_ret(void); extern void entry_untrain_ret(void); -extern void entry_ibpb(void); +extern void write_ibpb(void); #ifdef CONFIG_X86_64 extern void clear_bhb_loop(void); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7df458a6553e..fda31d2eb51b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1125,7 +1125,7 @@ static void __init retbleed_select_mitigation(void) setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -2637,7 +2637,7 @@ static void __init srso_select_mitigation(void) setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -2656,7 +2656,7 @@ static void __init srso_select_mitigation(void) srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ -- Gitee From 2074d4daf87f11caa6cebd57e17743c1719f8cae Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:31 -0700 Subject: [PATCH 25/40] x86/bugs: Use SBPB in write_ibpb() if applicable mainline inclusion from mainline- commit fc9fd3f98423367c79e0bd85a9515df26dc1b3cc category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/fc9fd3f98423367c79e0bd85a9515df26dc1b3cc -------------------------------- commit fc9fd3f98423367c79e0bd85a9515df26dc1b3cc upstream write_ibpb() does IBPB, which (among other things) flushes branch type predictions on AMD. If the CPU has SRSO_NO, or if the SRSO mitigation has been disabled, branch type flushing isn't needed, in which case the lighter-weight SBPB can be used. The 'x86_pred_cmd' variable already keeps track of whether IBPB or SBPB should be used. Use that instead of hardcoding IBPB. 
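A small model of the selection described above, nothing more: the MSR bit values are the standard PRED_CMD definitions (IBPB is bit 0, SBPB is bit 7), and the flag passed in is a stand-in for the SRSO_NO / mitigation-disabled conditions:

  #include <stdio.h>
  #include <stdint.h>

  #define PRED_CMD_IBPB (1ULL << 0)  /* full barrier, also flushes branch type state on AMD */
  #define PRED_CMD_SBPB (1ULL << 7)  /* selective barrier, skips the branch type flush */

  /* x86_pred_cmd holds whichever value write_ibpb() sends to
   * MSR_IA32_PRED_CMD, so switching it to SBPB is all that is needed
   * when branch type flushing is not required. */
  static uint64_t pick_pred_cmd(int branch_type_flush_not_needed)
  {
          return branch_type_flush_not_needed ? PRED_CMD_SBPB : PRED_CMD_IBPB;
  }

  int main(void)
  {
          printf("SRSO_NO / mitigation off -> %#llx\n", (unsigned long long)pick_pred_cmd(1));
          printf("SRSO mitigation needed   -> %#llx\n", (unsigned long long)pick_pred_cmd(0));
          return 0;
  }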
Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/17c5dcd14b29199b75199d67ff7758de9d9a4928.1744148254.git.jpoimboe@kernel.org Signed-off-by: chaithanyaLagisetty --- arch/x86/entry/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index e0ebd95434f7..fff3c8ae9701 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -17,7 +17,7 @@ /* Clobbers AX, CX, DX */ SYM_FUNC_START(write_ibpb) movl $MSR_IA32_PRED_CMD, %ecx - movl $PRED_CMD_IBPB, %eax + movl _ASM_RIP(x86_pred_cmd), %eax xorl %edx, %edx wrmsr -- Gitee From a48b2ad842c8f08018c8c716fe35b54b5248737b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:32 -0700 Subject: [PATCH 26/40] x86/bugs: Fix RSB clearing in indirect_branch_prediction_barrier() mainline inclusion from mainline- commit b1b19cfcf4656c75088dc06b7499f493e0dec3e5 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/b1b19cfcf4656c75088dc06b7499f493e0dec3e5 -------------------------------- commit b1b19cfcf4656c75088dc06b7499f493e0dec3e5 upstream IBPB is expected to clear the RSB. However, if X86_BUG_IBPB_NO_RET is set, that doesn't happen. Make indirect_branch_prediction_barrier() take that into account by calling write_ibpb() which clears RSB on X86_BUG_IBPB_NO_RET: /* Make sure IBPB clears return stack preductions too. */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET Note that, as of the previous patch, write_ibpb() also reads 'x86_pred_cmd' in order to use SBPB when applicable: movl _ASM_RIP(x86_pred_cmd), %eax Therefore that existing behavior in indirect_branch_prediction_barrier() is not lost. [Backport changes] In arch/x86/include/asm/nospec-branch.h, within the function indirect_branch_prediction_barrier(), current upstream commit b1b19cfcf465 replaced the call to alternative_msr_write() with an asm_inline volatile(ALTERNATIVE(...)) construct and use the feature flag from X86_FEATURE_USE_IBPB to X86_FEATURE_IBPB. However, since commit 549435aab49a, which renamed X86_FEATURE_USE_IBPB to X86_FEATURE_IBPB, is not present in the current source, the existing X86_FEATURE_USE_IBPB definition has been retained in this backport. This change preserves functionality while ensuring compatibility with the current codebase. 
Fixes: 50e4b3b94090 ("x86/entry: Have entry_ibpb() invalidate return predictions") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/bba68888c511743d4cd65564d1fc41438907523f.1744148254.git.jpoimboe@kernel.org Signed-off-by: chaithanyaLagisetty --- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/bugs.c | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 8226a48a8ded..0ac0b8feb7af 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -545,11 +545,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } -extern u64 x86_pred_cmd; - static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB); + asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_USE_IBPB) + : ASM_CALL_CONSTRAINT + :: "rax", "rcx", "rdx", "memory"); } /* The Intel SPEC CTRL MSR base value cache */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index fda31d2eb51b..5e496b68a1ce 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -59,7 +59,6 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; -EXPORT_SYMBOL_GPL(x86_pred_cmd); static u64 __ro_after_init x86_arch_cap_msr; -- Gitee From b20606d9656083fb4f23aad527dc54436d9c0ef6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:33 -0700 Subject: [PATCH 27/40] x86/bugs: Don't fill RSB on VMEXIT with eIBRS+retpoline mainline inclusion from mainline- commit 18bae0dfec15b24ec14ca17dc18603372f5f254f category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/18bae0dfec15b24ec14ca17dc18603372f5f254f -------------------------------- commit 18bae0dfec15b24ec14ca17dc18603372f5f254f upstream eIBRS protects against guest->host RSB underflow/poisoning attacks. Adding retpoline to the mix doesn't change that. Retpoline has a balanced CALL/RET anyway. So the current full RSB filling on VMEXIT with eIBRS+retpoline is overkill. Disable it or do the VMEXIT_LITE mitigation if needed. 
Suggested-by: Pawan Gupta Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Pawan Gupta Reviewed-by: Amit Shah Reviewed-by: Nikolay Borisov Cc: Paolo Bonzini Cc: Vitaly Kuznetsov Cc: Sean Christopherson Cc: David Woodhouse Link: https://lore.kernel.org/r/84a1226e5c9e2698eae1b5ade861f1b8bf3677dc.1744148254.git.jpoimboe@kernel.org Signed-off-by: chaithanyaLagisetty --- arch/x86/kernel/cpu/bugs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5e496b68a1ce..78e37011747c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1599,20 +1599,20 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ case SPECTRE_V2_NONE: return; - case SPECTRE_V2_EIBRS_LFENCE: case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS_RETPOLINE: if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); } return; - case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); return; } -- Gitee From 8b5a6cbddcfcd18c5ee290ae5821db834067f55a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:34 -0700 Subject: [PATCH 28/40] x86/bugs: Don't fill RSB on context switch with eIBRS mainline inclusion from mainline- commit 27ce8299bc1ec6df8306073785ff82b30b3cc5ee category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/27ce8299bc1ec6df8306073785ff82b30b3cc5ee -------------------------------- commit 27ce8299bc1ec6df8306073785ff82b30b3cc5ee upstream User->user Spectre v2 attacks (including RSB) across context switches are already mitigated by IBPB in cond_mitigation(), if enabled globally or if either the prev or the next task has opted in to protection. RSB filling without IBPB serves no purpose for protecting user space, as indirect branches are still vulnerable. User->kernel RSB attacks are mitigated by eIBRS. In which case the RSB filling on context switch isn't needed, so remove it. 
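Combined with the previous patch, the consolidated spectre_v2_select_rsb_mitigation() boils down to the following sketch (SPECTRE_V2_NONE, the PBRSB "lite" VMEXIT handling and the pr_info()/pr_warn_once() messages are omitted):

  switch (mode) {
  case SPECTRE_V2_EIBRS:
  case SPECTRE_V2_EIBRS_LFENCE:
  case SPECTRE_V2_EIBRS_RETPOLINE:
  	/* user->kernel is covered by eIBRS and user->user by the
  	 * conditional IBPB in cond_mitigation(): no RSB_CTXSW. */
  	break;

  case SPECTRE_V2_RETPOLINE:
  case SPECTRE_V2_LFENCE:
  case SPECTRE_V2_IBRS:
  	setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
  	setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
  	break;
  }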
Suggested-by: Pawan Gupta Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Pawan Gupta Reviewed-by: Amit Shah Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/98cdefe42180358efebf78e3b80752850c7a3e1b.1744148254.git.jpoimboe@kernel.org Signed-off-by: chaithanyaLagisetty --- arch/x86/kernel/cpu/bugs.c | 24 ++++++++++++------------ arch/x86/mm/tlb.c | 6 +++--- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 78e37011747c..95e69794597b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1573,7 +1573,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) rrsba_disabled = true; } -static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* * Similar to context switches, there are two types of RSB attacks @@ -1597,7 +1597,7 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ */ switch (mode) { case SPECTRE_V2_NONE: - return; + break; case SPECTRE_V2_EIBRS: case SPECTRE_V2_EIBRS_LFENCE: @@ -1606,18 +1606,21 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); } - return; + break; case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: - pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); - return; - } + break; - pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); - dump_stack(); + default: + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n"); + dump_stack(); + break; + } } /* @@ -1843,10 +1846,7 @@ static void __init spectre_v2_select_mitigation(void) * * FIXME: Is this pointless for retbleed-affected AMD? */ - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - - spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + spectre_v2_select_rsb_mitigation(mode); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 64f594826a28..51a7f74ae8aa 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -392,9 +392,9 @@ static void cond_mitigation(struct task_struct *next) prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); /* - * Avoid user/user BTB poisoning by flushing the branch predictor - * when switching between processes. This stops one process from - * doing Spectre-v2 attacks on another. + * Avoid user->user BTB/RSB poisoning by flushing them when switching + * between processes. This stops one process from doing Spectre-v2 + * attacks on another. 
* * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the -- Gitee From 97afc37a5230ea137aea7046771158c2d2111c17 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:35 -0700 Subject: [PATCH 29/40] x86/bugs: Add RSB mitigation document mainline inclusion from mainline- commit 83f6665a49c3d44ad0c08f837d352dd290f5d10b category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/83f6665a49c3d44ad0c08f837d352dd290f5d10b -------------------------------- commit 83f6665a49c3d44ad0c08f837d352dd290f5d10b upstream Create a document to summarize hard-earned knowledge about RSB-related mitigations, with references, and replace the overly verbose yet incomplete comments with a reference to the document. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/ab73f4659ba697a974759f07befd41ae605e33dd.1744148254.git.jpoimboe@kernel.org Signed-off-by: Malathi --- Documentation/admin-guide/hw-vuln/index.rst | 1 + Documentation/admin-guide/hw-vuln/rsb.rst | 268 ++++++++++++++++++++ arch/x86/kernel/cpu/bugs.c | 64 +---- 3 files changed, 282 insertions(+), 51 deletions(-) create mode 100644 Documentation/admin-guide/hw-vuln/rsb.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index ff0b440ef2dc..451874b8135d 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -22,3 +22,4 @@ are configurable at compile, boot or run time. srso gather_data_sampling reg-file-data-sampling + rsb diff --git a/Documentation/admin-guide/hw-vuln/rsb.rst b/Documentation/admin-guide/hw-vuln/rsb.rst new file mode 100644 index 000000000000..21dbf9cf25f8 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/rsb.rst @@ -0,0 +1,268 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +RSB-related mitigations +======================= + +.. warning:: + Please keep this document up-to-date, otherwise you will be + volunteered to update it and convert it to a very long comment in + bugs.c! + +Since 2018 there have been many Spectre CVEs related to the Return Stack +Buffer (RSB) (sometimes referred to as the Return Address Stack (RAS) or +Return Address Predictor (RAP) on AMD). + +Information about these CVEs and how to mitigate them is scattered +amongst a myriad of microarchitecture-specific documents. + +This document attempts to consolidate all the relevant information in +once place and clarify the reasoning behind the current RSB-related +mitigations. It's meant to be as concise as possible, focused only on +the current kernel mitigations: what are the RSB-related attack vectors +and how are they currently being mitigated? + +It's *not* meant to describe how the RSB mechanism operates or how the +exploits work. More details about those can be found in the references +below. + +Rather, this is basically a glorified comment, but too long to actually +be one. So when the next CVE comes along, a kernel developer can +quickly refer to this as a refresher to see what we're actually doing +and why. + +At a high level, there are two classes of RSB attacks: RSB poisoning +(Intel and AMD) and RSB underflow (Intel only). They must each be +considered individually for each attack vector (and microarchitecture +where applicable). 
+ +---- + +RSB poisoning (Intel and AMD) +============================= + +SpectreRSB +~~~~~~~~~~ + +RSB poisoning is a technique used by SpectreRSB [#spectre-rsb]_ where +an attacker poisons an RSB entry to cause a victim's return instruction +to speculate to an attacker-controlled address. This can happen when +there are unbalanced CALLs/RETs after a context switch or VMEXIT. + +* All attack vectors can potentially be mitigated by flushing out any + poisoned RSB entries using an RSB filling sequence + [#intel-rsb-filling]_ [#amd-rsb-filling]_ when transitioning between + untrusted and trusted domains. But this has a performance impact and + should be avoided whenever possible. + + .. DANGER:: + **FIXME**: Currently we're flushing 32 entries. However, some CPU + models have more than 32 entries. The loop count needs to be + increased for those. More detailed information is needed about RSB + sizes. + +* On context switch, the user->user mitigation requires ensuring the + RSB gets filled or cleared whenever IBPB gets written [#cond-ibpb]_ + during a context switch: + + * AMD: + On Zen 4+, IBPB (or SBPB [#amd-sbpb]_ if used) clears the RSB. + This is indicated by IBPB_RET in CPUID [#amd-ibpb-rsb]_. + + On Zen < 4, the RSB filling sequence [#amd-rsb-filling]_ must be + always be done in addition to IBPB [#amd-ibpb-no-rsb]_. This is + indicated by X86_BUG_IBPB_NO_RET. + + * Intel: + IBPB always clears the RSB: + + "Software that executed before the IBPB command cannot control + the predicted targets of indirect branches executed after the + command on the same logical processor. The term indirect branch + in this context includes near return instructions, so these + predicted targets may come from the RSB." [#intel-ibpb-rsb]_ + +* On context switch, user->kernel attacks are prevented by SMEP. User + space can only insert user space addresses into the RSB. Even + non-canonical addresses can't be inserted due to the page gap at the + end of the user canonical address space reserved by TASK_SIZE_MAX. + A SMEP #PF at instruction fetch prevents the kernel from speculatively + executing user space. + + * AMD: + "Finally, branches that are predicted as 'ret' instructions get + their predicted targets from the Return Address Predictor (RAP). + AMD recommends software use a RAP stuffing sequence (mitigation + V2-3 in [2]) and/or Supervisor Mode Execution Protection (SMEP) + to ensure that the addresses in the RAP are safe for + speculation. Collectively, we refer to these mitigations as "RAP + Protection"." [#amd-smep-rsb]_ + + * Intel: + "On processors with enhanced IBRS, an RSB overwrite sequence may + not suffice to prevent the predicted target of a near return + from using an RSB entry created in a less privileged predictor + mode. Software can prevent this by enabling SMEP (for + transitions from user mode to supervisor mode) and by having + IA32_SPEC_CTRL.IBRS set during VM exits." [#intel-smep-rsb]_ + +* On VMEXIT, guest->host attacks are mitigated by eIBRS (and PBRSB + mitigation if needed): + + * AMD: + "When Automatic IBRS is enabled, the internal return address + stack used for return address predictions is cleared on VMEXIT." + [#amd-eibrs-vmexit]_ + + * Intel: + "On processors with enhanced IBRS, an RSB overwrite sequence may + not suffice to prevent the predicted target of a near return + from using an RSB entry created in a less privileged predictor + mode. 
Software can prevent this by enabling SMEP (for + transitions from user mode to supervisor mode) and by having + IA32_SPEC_CTRL.IBRS set during VM exits. Processors with + enhanced IBRS still support the usage model where IBRS is set + only in the OS/VMM for OSes that enable SMEP. To do this, such + processors will ensure that guest behavior cannot control the + RSB after a VM exit once IBRS is set, even if IBRS was not set + at the time of the VM exit." [#intel-eibrs-vmexit]_ + + Note that some Intel CPUs are susceptible to Post-barrier Return + Stack Buffer Predictions (PBRSB) [#intel-pbrsb]_, where the last + CALL from the guest can be used to predict the first unbalanced RET. + In this case the PBRSB mitigation is needed in addition to eIBRS. + +AMD RETBleed / SRSO / Branch Type Confusion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On AMD, poisoned RSB entries can also be created by the AMD RETBleed +variant [#retbleed-paper]_ [#amd-btc]_ or by Speculative Return Stack +Overflow [#amd-srso]_ (Inception [#inception-paper]_). The kernel +protects itself by replacing every RET in the kernel with a branch to a +single safe RET. + +---- + +RSB underflow (Intel only) +========================== + +RSB Alternate (RSBA) ("Intel Retbleed") +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some Intel Skylake-generation CPUs are susceptible to the Intel variant +of RETBleed [#retbleed-paper]_ (Return Stack Buffer Underflow +[#intel-rsbu]_). If a RET is executed when the RSB buffer is empty due +to mismatched CALLs/RETs or returning from a deep call stack, the branch +predictor can fall back to using the Branch Target Buffer (BTB). If a +user forces a BTB collision then the RET can speculatively branch to a +user-controlled address. + +* Note that RSB filling doesn't fully mitigate this issue. If there + are enough unbalanced RETs, the RSB may still underflow and fall back + to using a poisoned BTB entry. + +* On context switch, user->user underflow attacks are mitigated by the + conditional IBPB [#cond-ibpb]_ on context switch which effectively + clears the BTB: + + * "The indirect branch predictor barrier (IBPB) is an indirect branch + control mechanism that establishes a barrier, preventing software + that executed before the barrier from controlling the predicted + targets of indirect branches executed after the barrier on the same + logical processor." [#intel-ibpb-btb]_ + +* On context switch and VMEXIT, user->kernel and guest->host RSB + underflows are mitigated by IBRS or eIBRS: + + * "Enabling IBRS (including enhanced IBRS) will mitigate the "RSBU" + attack demonstrated by the researchers. As previously documented, + Intel recommends the use of enhanced IBRS, where supported. This + includes any processor that enumerates RRSBA but not RRSBA_DIS_S." + [#intel-rsbu]_ + + However, note that eIBRS and IBRS do not mitigate intra-mode attacks. + Like RRSBA below, this is mitigated by clearing the BHB on kernel + entry. + + As an alternative to classic IBRS, call depth tracking (combined with + retpolines) can be used to track kernel returns and fill the RSB when + it gets close to being empty. + +Restricted RSB Alternate (RRSBA) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some newer Intel CPUs have Restricted RSB Alternate (RRSBA) behavior, +which, similar to RSBA described above, also falls back to using the BTB +on RSB underflow. 
The only difference is that the predicted targets are +restricted to the current domain when eIBRS is enabled: + +* "Restricted RSB Alternate (RRSBA) behavior allows alternate branch + predictors to be used by near RET instructions when the RSB is + empty. When eIBRS is enabled, the predicted targets of these + alternate predictors are restricted to those belonging to the + indirect branch predictor entries of the current prediction domain. + [#intel-eibrs-rrsba]_ + +When a CPU with RRSBA is vulnerable to Branch History Injection +[#bhi-paper]_ [#intel-bhi]_, an RSB underflow could be used for an +intra-mode BTI attack. This is mitigated by clearing the BHB on +kernel entry. + +However if the kernel uses retpolines instead of eIBRS, it needs to +disable RRSBA: + +* "Where software is using retpoline as a mitigation for BHI or + intra-mode BTI, and the processor both enumerates RRSBA and + enumerates RRSBA_DIS controls, it should disable this behavior." + [#intel-retpoline-rrsba]_ + +---- + +References +========== + +.. [#spectre-rsb] `Spectre Returns! Speculation Attacks using the Return Stack Buffer `_ + +.. [#intel-rsb-filling] "Empty RSB Mitigation on Skylake-generation" in `Retpoline: A Branch Target Injection Mitigation `_ + +.. [#amd-rsb-filling] "Mitigation V2-3" in `Software Techniques for Managing Speculation `_ + +.. [#cond-ibpb] Whether IBPB is written depends on whether the prev and/or next task is protected from Spectre attacks. It typically requires opting in per task or system-wide. For more details see the documentation for the ``spectre_v2_user`` cmdline option in Documentation/admin-guide/kernel-parameters.txt. + +.. [#amd-sbpb] IBPB without flushing of branch type predictions. Only exists for AMD. + +.. [#amd-ibpb-rsb] "Function 8000_0008h -- Processor Capacity Parameters and Extended Feature Identification" in `AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions `_. SBPB behaves the same way according to `this email `_. + +.. [#amd-ibpb-no-rsb] `Spectre Attacks: Exploiting Speculative Execution `_ + +.. [#intel-ibpb-rsb] "Introduction" in `Post-barrier Return Stack Buffer Predictions / CVE-2022-26373 / INTEL-SA-00706 `_ + +.. [#amd-smep-rsb] "Existing Mitigations" in `Technical Guidance for Mitigating Branch Type Confusion `_ + +.. [#intel-smep-rsb] "Enhanced IBRS" in `Indirect Branch Restricted Speculation `_ + +.. [#amd-eibrs-vmexit] "Extended Feature Enable Register (EFER)" in `AMD64 Architecture Programmer's Manual Volume 2: System Programming `_ + +.. [#intel-eibrs-vmexit] "Enhanced IBRS" in `Indirect Branch Restricted Speculation `_ + +.. [#intel-pbrsb] `Post-barrier Return Stack Buffer Predictions / CVE-2022-26373 / INTEL-SA-00706 `_ + +.. [#retbleed-paper] `RETBleed: Arbitrary Speculative Code Execution with Return Instruction `_ + +.. [#amd-btc] `Technical Guidance for Mitigating Branch Type Confusion `_ + +.. [#amd-srso] `Technical Update Regarding Speculative Return Stack Overflow `_ + +.. [#inception-paper] `Inception: Exposing New Attack Surfaces with Training in Transient Execution `_ + +.. [#intel-rsbu] `Return Stack Buffer Underflow / Return Stack Buffer Underflow / CVE-2022-29901, CVE-2022-28693 / INTEL-SA-00702 `_ + +.. [#intel-ibpb-btb] `Indirect Branch Predictor Barrier' `_ + +.. [#intel-eibrs-rrsba] "Guidance for RSBU" in `Return Stack Buffer Underflow / Return Stack Buffer Underflow / CVE-2022-29901, CVE-2022-28693 / INTEL-SA-00702 `_ + +.. 
[#bhi-paper] `Branch History Injection: On the Effectiveness of Hardware Mitigations Against Cross-Privilege Spectre-v2 Attacks `_ + +.. [#intel-bhi] `Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 `_ + +.. [#intel-retpoline-rrsba] "Retpoline" in `Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 `_ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 95e69794597b..f5536ce82a43 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1576,25 +1576,25 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: + * WARNING! There are many subtleties to consider when changing *any* + * code related to RSB-related mitigations. Before doing so, carefully + * read the following document, and update if necessary: * - * 1) RSB underflow + * Documentation/admin-guide/hw-vuln/rsb.rst * - * 2) Poisoned RSB entry + * In an overly simplified nutshell: * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. + * - User->user RSB attacks are conditionally mitigated during + * context switches by cond_mitigation -> write_ibpb(). * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. + * - User->kernel and guest->host attacks are mitigated by eIBRS or + * RSB filling. * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. + * Though, depending on config, note that other alternative + * mitigations may end up getting used instead, e.g., IBPB on + * entry/vmexit, call depth tracking, or return thunks. */ + switch (mode) { case SPECTRE_V2_NONE: break; @@ -1808,44 +1808,6 @@ static void __init spectre_v2_select_mitigation(void) spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); - /* - * If Spectre v2 protection has been enabled, fill the RSB during a - * context switch. In general there are two types of RSB attacks - * across context switches, for which the CALLs/RETs may be unbalanced. - * - * 1) RSB underflow - * - * Some Intel parts have "bottomless RSB". When the RSB is empty, - * speculated return targets may come from the branch predictor, - * which could have a user-poisoned BTB or BHB entry. - * - * AMD has it even worse: *all* returns are speculated from the BTB, - * regardless of the state of the RSB. - * - * When IBRS or eIBRS is enabled, the "user -> kernel" attack - * scenario is mitigated by the IBRS branch prediction isolation - * properties, so the RSB buffer filling wouldn't be necessary to - * protect against this type of attack. - * - * The "user -> user" attack scenario is mitigated by RSB filling. - * - * 2) Poisoned RSB entry - * - * If the 'next' in-kernel return stack is shorter than 'prev', - * 'next' could be tricked into speculating with a user-poisoned RSB - * entry. - * - * The "user -> kernel" attack scenario is mitigated by SMEP and - * eIBRS. - * - * The "user -> user" scenario, also known as SpectreBHB, requires - * RSB clearing. 
- * - * So to mitigate all cases, unconditionally fill RSB on context - * switches. - * - * FIXME: Is this pointless for retbleed-affected AMD? - */ spectre_v2_select_rsb_mitigation(mode); /* -- Gitee From b87d797fb21865a0c4cbab1e5dddcadd66dbd5c7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 18 Apr 2025 11:49:40 +0530 Subject: [PATCH 30/40] x86/cpu/amd: Fix workaround for erratum 1054 mainline inclusion from mainline- commit 263e55949d8902a6a09bdb92a1ab6a3f67231abe category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/263e55949d8902a6a09bdb92a1ab6a3f67231abe -------------------------------- commit 263e55949d8902a6a09bdb92a1ab6a3f67231abe upstream Erratum 1054 affects AMD Zen processors that are a part of Family 17h Models 00-2Fh and the workaround is to not set HWCR[IRPerfEn]. However, when X86_FEATURE_ZEN1 was introduced, the condition to detect unaffected processors was incorrectly changed in a way that the IRPerfEn bit gets set only for unaffected Zen 1 processors. Ensure that HWCR[IRPerfEn] is set for all unaffected processors. This includes a subset of Zen 1 (Family 17h Models 30h and above) and all later processors. Also clear X86_FEATURE_IRPERF on affected processors so that the IRPerfCount register is not used by other entities like the MSR PMU driver. Fixes: 232afb557835 ("x86/CPU/AMD: Add X86_FEATURE_ZEN1") Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/caa057a9d6f8ad579e2f1abaa71efbd5bd4eaf6d.1744956467.git.sandipan.das@amd.com Signed-off-by: Malathi --- arch/x86/kernel/cpu/amd.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 251e23fc1a7f..e971ba13691b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -984,6 +984,16 @@ static void init_amd_zen1(struct cpuinfo_x86 *c) pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); setup_force_cpu_bug(X86_BUG_DIV0); + + /* + * Turn off the Instructions Retired free counter on machines that are + * susceptible to erratum #1054 "Instructions Retired Performance + * Counter May Be Inaccurate". + */ + if (c->x86_model < 0x30) { + msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + clear_cpu_cap(c, X86_FEATURE_IRPERF); + } } static bool cpu_has_zenbleed_microcode(void) @@ -1163,13 +1173,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_feature_enabled(X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); - /* - * Turn on the Instructions Retired free counter on machines not - * susceptible to erratum #1054 "Instructions Retired Performance - * Counter May Be Inaccurate". 
- */ - if (cpu_has(c, X86_FEATURE_IRPERF) && - (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f)) + /* Enable the Instructions Retired free counter */ + if (cpu_has(c, X86_FEATURE_IRPERF)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); -- Gitee From 28ad0634c865880ff63a16697ae5b58830d59def Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:50 -0700 Subject: [PATCH 31/40] KVM: Introduce KVM_SET_USER_MEMORY_REGION2 mainline inclusion from mainline- commit bb58b90b1a8f753b582055adaf448214a8e22c31 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/bb58b90b1a8f753b582055adaf448214a8e22c31 -------------------------------- commit bb58b90b1a8f753b582055adaf448214a8e22c31 upstream Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional information can be supplied without setting userspace up to fail. The padding in the new kvm_userspace_memory_region2 structure will be used to pass a file descriptor in addition to the userspace_addr, i.e. allow userspace to point at a file descriptor and map memory into a guest that is NOT mapped into host userspace. Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" without a new ioctl(), but as Paolo pointed out, adding a new ioctl() makes detection of bad flags a bit more robust, e.g. if the new fd field is guarded only by a flag and not a new ioctl(), then a userspace bug (setting a "bad" flag) would generate out-of-bounds access instead of an -EINVAL error. Cc: Jarkko Sakkinen Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-9-seanjc@google.com> Acked-by: Kai Huang Signed-off-by: Paolo Bonzini Signed-off-by: Malathi --- Documentation/virt/kvm/api.rst | 22 +++++++++++++ arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 4 +-- include/uapi/linux/kvm.h | 13 ++++++++ virt/kvm/kvm_main.c | 57 +++++++++++++++++++++++++++++----- 5 files changed, 87 insertions(+), 11 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index c40b6a09ffd8..9048de854236 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6148,6 +6148,28 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG interface. No error will be returned, but the resulting offset will not be applied. +4.140 KVM_SET_USER_MEMORY_REGION2 +--------------------------------- + +:Capability: KVM_CAP_USER_MEMORY2 +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_userspace_memory_region2 (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ + __u64 pad[16]; + }; + +See KVM_SET_USER_MEMORY_REGION. + 5. 
The kvm_run structure ======================== diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 914153ae6e5f..ddc9dbcb7a8b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12650,7 +12650,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m; + struct kvm_userspace_memory_region2 m; m.slot = id | (i << 16); m.flags = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a5cebf76aaa5..27875ee0f5bd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1230,9 +1230,9 @@ enum kvm_mr_change { }; int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 51da8fe01061..bbc4a05e4107 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for KVM_SET_USER_MEMORY_REGION2 */ +struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 pad[16]; +}; + /* * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for * userspace, other bits are reserved for kvm internal use which are defined @@ -1232,6 +1242,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_HISI_IPIV 798 #define KVM_CAP_ARM_VIRT_MSI_BYPASS 799 +#define KVM_CAP_USER_MEMORY2 231 #ifdef KVM_CAP_IRQ_ROUTING @@ -1544,6 +1555,8 @@ struct kvm_numa_info { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ + struct kvm_userspace_memory_region2) #define KVM_LOAD_USER_DATA _IOW(KVMIO, 0x49, struct kvm_user_data) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9e351bce483e..70f80b90dff3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1579,7 +1579,15 @@ static void kvm_replace_memslot(struct kvm *kvm, } } -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) +/* + * Flags that do not access any of the extra space of struct + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS + * only allows these. + */ +#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) + +static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; @@ -1981,7 +1989,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, * Must be called holding kvm->slots_lock for write. 
*/ int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; @@ -2085,7 +2093,7 @@ int __kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region2 *mem) { int r; @@ -2097,7 +2105,7 @@ int kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(kvm_set_memory_region); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region2 *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; @@ -4585,6 +4593,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { case KVM_CAP_USER_MEMORY: + case KVM_CAP_USER_MEMORY2: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: case KVM_CAP_INTERNAL_ERROR_DATA: @@ -4840,6 +4849,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) return fd; } +#define SANITY_CHECK_MEM_REGION_FIELD(field) \ +do { \ + BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \ + offsetof(struct kvm_userspace_memory_region2, field)); \ + BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \ + sizeof_field(struct kvm_userspace_memory_region2, field)); \ +} while (0) + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4862,15 +4879,39 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); break; } + case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: { - struct kvm_userspace_memory_region kvm_userspace_mem; + struct kvm_userspace_memory_region2 mem; + unsigned long size; + + if (ioctl == KVM_SET_USER_MEMORY_REGION) { + /* + * Fields beyond struct kvm_userspace_memory_region shouldn't be + * accessed, but avoid leaking kernel memory in case of a bug. + */ + memset(&mem, 0, sizeof(mem)); + size = sizeof(struct kvm_userspace_memory_region); + } else { + size = sizeof(struct kvm_userspace_memory_region2); + } + + /* Ensure the common parts of the two structs are identical. 
*/ + SANITY_CHECK_MEM_REGION_FIELD(slot); + SANITY_CHECK_MEM_REGION_FIELD(flags); + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); + SANITY_CHECK_MEM_REGION_FIELD(memory_size); + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); r = -EFAULT; - if (copy_from_user(&kvm_userspace_mem, argp, - sizeof(kvm_userspace_mem))) + if (copy_from_user(&mem, argp, size)) + goto out; + + r = -EINVAL; + if (ioctl == KVM_SET_USER_MEMORY_REGION && + (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS)) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } case KVM_GET_DIRTY_LOG: { -- Gitee From 858a688bd1ce571b7400284843fd63e1bdae15bc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 3 Nov 2023 06:47:51 -0400 Subject: [PATCH 32/40] fs: Rename anon_inode_getfile_secure() and anon_inode_getfd_secure() mainline inclusion from mainline- commit 4f0b9194bc119a9850a99e5e824808e2f468c348 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/4f0b9194bc119a9850a99e5e824808e2f468c348 -------------------------------- commit 4f0b9194bc119a9850a99e5e824808e2f468c348 upstream The call to the inode_init_security_anon() LSM hook is not the sole reason to use anon_inode_getfile_secure() or anon_inode_getfd_secure(). For example, the functions also allow one to create a file with non-zero size, without needing a full-blown filesystem. In this case, you don't need a "secure" version, just unique inodes; the current name of the functions is confusing and does not explain well the difference with the more "standard" anon_inode_getfile() and anon_inode_getfd(). Of course, there is another side of the coin; neither io_uring nor userfaultfd strictly speaking need distinct inodes, and it is not that clear anymore that anon_inode_create_get{file,fd}() allow the LSM to intercept and block the inode's creation. If one was so inclined, anon_inode_getfile_secure() and anon_inode_getfd_secure() could be kept, using the shared inode or a new one depending on CONFIG_SECURITY. However, this is probably overkill, and potentially a cause of bugs in different configurations. Therefore, just add a comment to io_uring and userfaultfd explaining the choice of the function. While at it, remove the export for what is now anon_inode_create_getfd(). There is no in-tree module that uses it, and the old name is gone anyway. If anybody actually needs the symbol, they can ask or they can just use anon_inode_create_getfile(), which will be exported very soon for use in KVM. [Backport changes] 1. In io_uring/io_uring.c, io_uring_get_file() retains the changes made in commit 6fc19b3 and already matches the later upstream commit 09d1c6a: instead of assigning the result to a local variable, the function directly returns the result of anon_inode_create_getfile(), which reflects the final upstream version.
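As a concrete example of the renamed API, the io_uring hunk in this patch ends up reading as follows (a caller that wants a distinct, LSM-visible inode):

  static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
  {
  	/* Create a new inode so that the LSM can block the creation. */
  	return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx,
  					 O_RDWR | O_CLOEXEC, NULL);
  }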
Suggested-by: Christian Brauner Reviewed-by: Christian Brauner Signed-off-by: Paolo Bonzini Signed-off-by: Malathi --- fs/anon_inodes.c | 50 ++++++++++++++++++++++++------------- fs/userfaultfd.c | 5 ++-- include/linux/anon_inodes.h | 4 +-- io_uring/io_uring.c | 3 ++- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 24192a7667ed..42b02dc36474 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, - bool secure) + bool make_inode) { struct inode *inode; struct file *file; @@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name, if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); - if (secure) { + if (make_inode) { inode = anon_inode_make_secure_inode(name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); @@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name, EXPORT_SYMBOL_GPL(anon_inode_getfile); /** - * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new + * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new * !S_PRIVATE anon inode rather than reuse the * singleton anon inode and calls the - * inode_init_security_anon() LSM hook. This - * allows for both the inode to have its own - * security context and for the LSM to enforce - * policy on the inode's creation. + * inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile); * @context_inode: * [in] the logical relationship with the new inode (optional) * + * Create a new anonymous inode and file pair. This can be done for two + * reasons: + * + * - for the inode to have its own security context, so that LSMs can enforce + * policy on the inode's creation; + * + * - if the caller needs a unique inode, for example in order to customize + * the size returned by fstat() + * * The LSM may use @context_inode in inode_init_security_anon(), but a - * reference to it is not held. Returns the newly created file* or an error - * pointer. See the anon_inode_getfile() documentation for more information. + * reference to it is not held. + * + * Returns the newly created file* or an error pointer. */ -struct file *anon_inode_getfile_secure(const char *name, +struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) @@ -181,7 +188,7 @@ static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, - bool secure) + bool make_inode) { int error, fd; struct file *file; @@ -192,7 +199,7 @@ static int __anon_inode_getfd(const char *name, fd = error; file = __anon_inode_getfile(name, fops, priv, flags, context_inode, - secure); + make_inode); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; @@ -231,10 +238,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, EXPORT_SYMBOL_GPL(anon_inode_getfd); /** - * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new + * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls - * the inode_init_security_anon() LSM hook. 
This allows the inode to have its - * own security context and for a LSM to reject creation of the inode. + * the inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -243,16 +249,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); * @context_inode: * [in] the logical relationship with the new inode (optional) * + * Create a new anonymous inode and file pair. This can be done for two + * reasons: + * + * - for the inode to have its own security context, so that LSMs can enforce + * policy on the inode's creation; + * + * - if the caller needs a unique inode, for example in order to customize + * the size returned by fstat() + * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. + * + * Returns a newly created file descriptor or an error code. */ -int anon_inode_getfd_secure(const char *name, const struct file_operations *fops, +int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); } -EXPORT_SYMBOL_GPL(anon_inode_getfd_secure); static int __init anon_inode_init(void) { diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d616b7777eef..eb46803cd512 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1083,7 +1083,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new, { int fd; - fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -2280,7 +2280,8 @@ static int new_userfaultfd(int flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, + /* Create a new inode so that the LSM can block the creation. */ + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx, O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 5deaddbd7927..93a5f16d03f3 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h @@ -15,13 +15,13 @@ struct inode; struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags); -struct file *anon_inode_getfile_secure(const char *name, +struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode); int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags); -int anon_inode_getfd_secure(const char *name, +int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8b15e9dc340f..83e9998955cd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3961,7 +3961,8 @@ static int io_uring_install_fd(struct file *file) */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { - return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, + /* Create a new inode so that the LSM can block the creation. 
*/ + return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC, NULL); } -- Gitee From 120bbf2c263ab8d93216643b4edf282953217086 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:03 -0700 Subject: [PATCH 33/40] KVM: Drop superfluous __KVM_VCPU_MULTIPLE_ADDRESS_SPACE macro mainline inclusion from mainline- commit 2333afa17af0f4b6651214ee17cfd5ae5f47787a category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/2333afa17af0f4b6651214ee17cfd5ae5f47787a -------------------------------- commit 2333afa17af0f4b6651214ee17cfd5ae5f47787a upstream Drop __KVM_VCPU_MULTIPLE_ADDRESS_SPACE and instead check the value of KVM_ADDRESS_SPACE_NUM. No functional change intended. Reviewed-by: Paolo Bonzini Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-22-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Malathi --- arch/x86/include/asm/kvm_host.h | 1 - include/linux/kvm_host.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d27a280daf5c..43397faa0f62 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2155,7 +2155,6 @@ enum { #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) -# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE # define KVM_ADDRESS_SPACE_NUM 2 # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 27875ee0f5bd..0d74de960d0c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -709,7 +709,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) -#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#if KVM_ADDRESS_SPACE_NUM == 1 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; -- Gitee From f4517c7f392a50829213e41cf41d608a805d4557 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:01 -0700 Subject: [PATCH 34/40] KVM: x86: Disallow hugepages when memory attributes are mixed mainline inclusion from mainline- commit 90b4fe17981e155432c4dbc490606d0c2e9c2199 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/90b4fe17981e155432c4dbc490606d0c2e9c2199 -------------------------------- commit 90b4fe17981e155432c4dbc490606d0c2e9c2199 upstream Disallow creating hugepages with mixed memory attributes, e.g. shared versus private, as mapping a hugepage in this case would allow the guest to access memory with the wrong attributes, e.g. overlaying private memory with a shared hugepage. Tracking whether or not attributes are mixed via the existing disallow_lpage field, but use the most significant bit in 'disallow_lpage' to indicate a hugepage has mixed attributes instead using the normal refcounting. Whether or not attributes are mixed is binary; either they are or they aren't. Attempting to squeeze that info into the refcount is unnecessarily complex as it would require knowing the previous state of the mixed count when updating attributes. Using a flag means KVM just needs to ensure the current status is reflected in the memslots. 
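Condensed from the mmu.c hunk below, the flag occupies the top bit of disallow_lpage while the low bits remain a refcount:

  /* MSB of disallow_lpage flags "mixed attributes" at this level. */
  #define KVM_LPAGE_MIXED_FLAG	BIT(31)

  static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level)
  {
  	return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
  }

  static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level)
  {
  	lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
  }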
Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-20-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Malathi --- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/mmu/mmu.c | 154 +++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 4 + 3 files changed, 159 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 43397faa0f62..6aea7cf7ac18 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1870,6 +1870,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot); + void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1abee3f522d2..bea05bc6fa87 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -795,16 +795,26 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +/* + * The most significant bit in disallow_lpage tracks whether or not memory + * attributes are mixed, i.e. not identical for all gfns at the current level. + * The lower order bits are used to refcount other cases where a hugepage is + * disallowed, e.g. if KVM has shadow a page table at the gfn. + */ +#define KVM_LPAGE_MIXED_FLAG BIT(31) + static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; - int i; + int old, i; for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); + + old = linfo->disallow_lpage; linfo->disallow_lpage += count; - WARN_ON_ONCE(linfo->disallow_lpage < 0); + WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG); } } @@ -7170,3 +7180,143 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) if (kvm->arch.nx_huge_page_recovery_thread) kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } + +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + +static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, int level, unsigned long attrs) +{ + const unsigned long start = gfn; + const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); + + if (level == PG_LEVEL_2M) + return kvm_range_has_memory_attributes(kvm, start, end, attrs); + + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { + if (hugepage_test_mixed(slot, gfn, level - 1) || + attrs != kvm_get_memory_attributes(kvm, gfn)) + return false; + } + return true; +} + +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + unsigned long attrs = range->arg.attributes; + struct kvm_memory_slot *slot = range->slot; + int level; + + 
lockdep_assert_held_write(&kvm->mmu_lock); + lockdep_assert_held(&kvm->slots_lock); + + /* + * Calculate which ranges can be mapped with hugepages even if the slot + * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over + * a range that has PRIVATE GFNs, and conversely converting a range to + * SHARED may now allow hugepages. + */ + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + return false; + + /* + * The sequence matters here: upper levels consume the result of lower + * level's scanning. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn = gfn_round_for_level(range->start, level); + + /* Process the head page if it straddles the range. */ + if (gfn != range->start || gfn + nr_pages > range->end) { + /* + * Skip mixed tracking if the aligned gfn isn't covered + * by the memslot, KVM can't use a hugepage due to the + * misaligned address regardless of memory attributes. + */ + if (gfn >= slot->base_gfn) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + gfn += nr_pages; + } + + /* + * Pages entirely covered by the range are guaranteed to have + * only the attributes which were just set. + */ + for ( ; gfn + nr_pages <= range->end; gfn += nr_pages) + hugepage_clear_mixed(slot, gfn, level); + + /* + * Process the last tail page if it straddles the range and is + * contained by the memslot. Like the head page, KVM can't + * create a hugepage if the slot size is misaligned. + */ + if (gfn < range->end && + (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } + return false; +} + +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + int level; + + if (!kvm_arch_has_private_mem(kvm)) + return; + + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + /* + * Don't bother tracking mixed attributes for pages that can't + * be huge due to alignment, i.e. process only pages that are + * entirely contained by the memslot. + */ + gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level); + gfn_t start = gfn_round_for_level(slot->base_gfn, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn; + + if (start < slot->base_gfn) + start += nr_pages; + + /* + * Unlike setting attributes, every potential hugepage needs to + * be manually checked as the attributes may already be mixed. 
+ */ + for (gfn = start; gfn < end; gfn += nr_pages) { + unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); + + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } +} +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ddc9dbcb7a8b..9bbc3866190c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12800,6 +12800,10 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, } } +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + kvm_mmu_init_memslot_memory_attributes(kvm, slot); +#endif + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; -- Gitee From ffa997b492479d31dba11d27321937bee33047f9 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Wed, 29 May 2024 19:36:05 +0100 Subject: [PATCH 35/40] x86/cpu/intel: Drop stray FAM6 check with new Intel CPU model defines mainline inclusion from mainline- commit 34b3fc558b537bdf99644dcde539e151716f6331 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/34b3fc558b537bdf99644dcde539e151716f6331 -------------------------------- commit 34b3fc558b537bdf99644dcde539e151716f6331 upstream The outer if () should have been dropped when switching to c->x86_vfm. Fixes: 6568fc18c2f6 ("x86/cpu/intel: Switch to new Intel CPU model defines") Signed-off-by: Andrew Cooper Signed-off-by: Borislav Petkov (AMD) Acked-by: Tony Luck Link: https://lore.kernel.org/r/20240529183605.17520-1-andrew.cooper3@citrix.com Signed-off-by: Arukonda Rahul --- arch/x86/kernel/cpu/intel.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 92a4d9e88b07..cefa6f7041f1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -313,17 +313,13 @@ static void early_init_intel(struct cpuinfo_x86 *c) } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ - if (c->x86 == 6) { - switch (c->x86_vfm) { - case INTEL_ATOM_SALTWELL_MID: - case INTEL_ATOM_SALTWELL_TABLET: - case INTEL_ATOM_SILVERMONT_MID: - case INTEL_ATOM_AIRMONT_NP: - set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); - break; - default: - break; - } + switch (c->x86_vfm) { + case INTEL_ATOM_SALTWELL_MID: + case INTEL_ATOM_SALTWELL_TABLET: + case INTEL_ATOM_SILVERMONT_MID: + case INTEL_ATOM_AIRMONT_NP: + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); + break; } /* -- Gitee From e46775e4d6c2fdf2924c8798b95c9d91cff8992b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Dec 2024 10:04:40 -0800 Subject: [PATCH 36/40] perf/core: Export perf_exclude_event() mainline inclusion from mainline- commit 6057b90ecc84f232dd32a047a086a4c4c271765f category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/6057b90ecc84f232dd32a047a086a4c4c271765f -------------------------------- commit 6057b90ecc84f232dd32a047a086a4c4c271765f upstream While at it, rename the same function in s390 cpum_sf PMU. 
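The export is consumed by the AMD IBS patch later in this series, which drops filtered samples from its NMI handler roughly as follows (names as in that patch):

  if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
      perf_exclude_event(event, &regs)) {
  	throttle = perf_event_account_interrupt(event);
  	goto out;
  }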
Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Ravi Bangoria Reviewed-by: Ravi Bangoria Acked-by: Thomas Richter Link: https://lore.kernel.org/r/20241203180441.1634709-2-namhyung@kernel.org Signed-off-by: Arukonda Rahul --- arch/s390/kernel/perf_cpum_sf.c | 6 +++--- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 3 +-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index e52c89739bc9..928333a0a8cf 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1074,7 +1074,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) cpuhw->flags &= ~PMU_F_ENABLED; } -/* perf_exclude_event() - Filter event +/* perf_event_exclude() - Filter event * @event: The perf event * @regs: pt_regs structure * @sde_regs: Sample-data-entry (sde) regs structure @@ -1083,7 +1083,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) * * Return non-zero if the event shall be excluded. */ -static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, +static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs, struct perf_sf_sde_regs *sde_regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -1166,7 +1166,7 @@ static int perf_push_sample(struct perf_event *event, data.tid_entry.pid = basic->hpp & LPP_PID_MASK; overflow = 0; - if (perf_exclude_event(event, ®s, sde_regs)) + if (perf_event_exclude(event, ®s, sde_regs)) goto out; if (perf_event_overflow(event, &data, ®s)) { overflow = 1; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 826fb16906fe..d0a15530cf9b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1689,6 +1689,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr) return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } +extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); + extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, @@ -1864,6 +1866,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) diff --git a/kernel/events/core.c b/kernel/events/core.c index 922389a83c67..3b8d02df001a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9801,8 +9801,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; -- Gitee From b6c8d98f76c64ff9eaa9c528a144021c55774797 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Dec 2024 10:04:41 -0800 Subject: [PATCH 37/40] perf/x86: Relax privilege filter restriction on AMD IBS mainline inclusion from mainline- commit d29e744c71673a71da8f8522799ee02744cad6c9 category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/d29e744c71673a71da8f8522799ee02744cad6c9 -------------------------------- commit d29e744c71673a71da8f8522799ee02744cad6c9 upstream While IBS is available for per-thread profiling, still regular users cannot open an event due to the default paranoid setting (2) which doesn't allow 
unprivileged users to get kernel samples. That means it needs to set exclude_kernel bit in the attribute but IBS driver would reject it since it has PERF_PMU_CAP_NO_EXCLUDE. This is not what we want and I've been getting requests to fix this issue. This should be done in the hardware, but until we get the HW fix we may allow exclude_{kernel,user,hv} in the attribute and silently drop the samples in the PMU IRQ handler. It won't guarantee the sampling frequency or even it'd miss some with fixed period too. Not ideal, but that'd still be helpful to regular users. To minimize the confusion, let's add 'swfilt' bit to attr.config2 which is exposed in the sysfs format directory so that users can figure out if the kernel support the privilege filters by software. $ perf record -e ibs_op/swfilt=1/u true This uses perf_exclude_event() which checks regs->cs. But it should be fine because set_linear_ip() also updates the CS according to the RIP provided by IBS. Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Ravi Bangoria Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241203180441.1634709-3-namhyung@kernel.org Signed-off-by: Arukonda Rahul --- arch/x86/events/amd/ibs.c | 59 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 64d7b125d0f6..69463e94be82 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -28,6 +28,8 @@ static u32 ibs_caps; #include #include +/* attr.config2 */ +#define IBS_SW_FILTER_MASK 1 /* * IBS states: @@ -296,6 +298,16 @@ static int perf_ibs_init(struct perf_event *event) if (has_branch_stack(event)) return -EOPNOTSUPP; + /* handle exclude_{user,kernel} in the IRQ handler */ + if (event->attr.exclude_host || event->attr.exclude_guest || + event->attr.exclude_idle) + return -EINVAL; + + if (!(event->attr.config2 & IBS_SW_FILTER_MASK) && + (event->attr.exclude_kernel || event->attr.exclude_user || + event->attr.exclude_hv)) + return -EINVAL; + ret = validate_group(event); if (ret) return ret; @@ -604,24 +616,14 @@ static struct attribute *attrs_empty[] = { NULL, }; -static struct attribute_group empty_format_group = { - .name = "format", - .attrs = attrs_empty, -}; - static struct attribute_group empty_caps_group = { .name = "caps", .attrs = attrs_empty, }; -static const struct attribute_group *empty_attr_groups[] = { - &empty_format_group, - &empty_caps_group, - NULL, -}; - PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_FORMAT_ATTR(swfilt, "config2:0"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); @@ -647,8 +649,9 @@ ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? 
attr->mode : 0; } -static struct attribute *rand_en_attrs[] = { +static struct attribute *fetch_attrs[] = { &format_attr_rand_en.attr, + &format_attr_swfilt.attr, NULL, }; @@ -672,9 +675,9 @@ static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = { NULL, }; -static struct attribute_group group_rand_en = { +static struct attribute_group group_fetch_formats = { .name = "format", - .attrs = rand_en_attrs, + .attrs = fetch_attrs, }; static struct attribute_group group_fetch_l3missonly = { @@ -702,7 +705,7 @@ static struct attribute_group group_ibs_op_dtlb_pgsize_cap = { }; static const struct attribute_group *fetch_attr_groups[] = { - &group_rand_en, + &group_fetch_formats, &empty_caps_group, NULL, }; @@ -719,6 +722,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; } +static struct attribute *op_attrs[] = { + &format_attr_swfilt.attr, + NULL, +}; + static struct attribute *cnt_ctl_attrs[] = { &format_attr_cnt_ctl.attr, NULL, @@ -734,6 +742,11 @@ static struct attribute *ibs_op_ldlat_format_attrs[] = { NULL, }; +static struct attribute_group group_op_formats = { + .name = "format", + .attrs = op_attrs, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -752,6 +765,12 @@ static struct attribute_group group_ibs_op_ldlat_format = { .is_visible = ibs_op_ldlat_is_visible, }; +static const struct attribute_group *op_attr_groups[] = { + &group_op_formats, + &empty_caps_group, + NULL, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, @@ -772,7 +791,6 @@ static struct perf_ibs perf_ibs_fetch = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSFETCHCTL, @@ -798,7 +816,6 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSOPCTL, @@ -1250,6 +1267,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs.flags |= PERF_EFLAGS_EXACT; } + if ((event->attr.config2 & IBS_SW_FILTER_MASK) && + perf_exclude_event(event, ®s)) { + throttle = perf_event_account_interrupt(event); + goto out; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw = (struct perf_raw_record){ .frag = { @@ -1372,7 +1395,7 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_ZEN4) perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; - perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_groups = op_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); -- Gitee From b38a2219eba322ceafcf5352536f12653b9ce89e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 17 Mar 2025 09:37:55 -0700 Subject: [PATCH 38/40] perf/x86: Check data address for IBS software filter mainline inclusion from mainline- commit 65a99264f5e5a2bcc8c905f7b2d633e8991672ac category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/65a99264f5e5a2bcc8c905f7b2d633e8991672ac -------------------------------- commit 65a99264f5e5a2bcc8c905f7b2d633e8991672ac upstream The IBS software filter is filtering kernel samples for regular users in the PMI handler. It checks the instruction address in the IBS register to determine if it was in kernel mode or not. 
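For reference, the existing check reduces to asking whether the RIP recovered from the IBS registers lies in the kernel half of the address space. A condensed sketch of the idea follows; the helper name swfilt_drop_by_ip is illustrative only, not the driver's actual code path, which goes through set_linear_ip() and perf_exclude_event() on the pt_regs copy (on x86-64, kernel_ip() effectively tests the sign bit of the address):

  static bool swfilt_drop_by_ip(struct perf_event *event, unsigned long ip)
  {
          bool in_kernel = kernel_ip(ip);         /* x86-64: (long)ip < 0 */

          if (event->attr.exclude_kernel && in_kernel)
                  return true;                    /* drop kernel-mode sample */
          if (event->attr.exclude_user && !in_kernel)
                  return true;                    /* drop user-mode sample */
          return false;
  }
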
But it turns out that it's possible to report a kernel data address even if the instruction address belongs to user-space. Matteo Rizzo found that when an instruction raises an exception, IBS can report some kernel data addresses like IDT while holding the faulting instruction's RIP. To prevent an information leak, it should double check if the data address in PERF_SAMPLE_DATA is in the kernel space as well. [ mingo: Clarified the changelog ] Suggested-by: Matteo Rizzo Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250317163755.1842589-1-namhyung@kernel.org Signed-off-by: Arukonda Rahul --- arch/x86/events/amd/ibs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 69463e94be82..aa12f2c10ba6 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1267,8 +1267,13 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs.flags |= PERF_EFLAGS_EXACT; } + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + if ((event->attr.config2 & IBS_SW_FILTER_MASK) && - perf_exclude_event(event, ®s)) { + (perf_exclude_event(event, ®s) || + ((data.sample_flags & PERF_SAMPLE_ADDR) && + event->attr.exclude_kernel && kernel_ip(data.addr)))) { throttle = perf_event_account_interrupt(event); goto out; } @@ -1283,9 +1288,6 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_sample_save_raw_data(&data, event, &raw); } - if (perf_ibs == &perf_ibs_op) - perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); - /* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs. Thus we need to use rip from -- Gitee From ef3137639327f61e5f2f3842c39abe4583fa58ea Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 22 Mar 2025 08:13:01 +0100 Subject: [PATCH 39/40] perf/amd/ibs: Prevent leaking sensitive data to userspace mainline inclusion from mainline- commit 50a53b60e141d7e31368a87e222e4dd5597bd4ae category: feature bugzilla: CVE: NA Reference: https://github.com/torvalds/linux/commit/50a53b60e141d7e31368a87e222e4dd5597bd4ae -------------------------------- commit 50a53b60e141d7e31368a87e222e4dd5597bd4ae upstream Although IBS "swfilt" can prevent leaking samples with kernel RIP to the userspace, there are few subtle cases where a 'data' address and/or a 'branch target' address can fall under kernel address range although RIP is from userspace. Prevent leaking kernel 'data' addresses by discarding such samples when {exclude_kernel=1,swfilt=1}. IBS can now be invoked by unprivileged user with the introduction of "swfilt". However, this creates a loophole in the interface where an unprivileged user can get physical address of the userspace virtual addresses through IBS register raw dump (PERF_SAMPLE_RAW). Prevent this as well. This upstream commit fixed the most obvious leak: 65a99264f5e5 perf/x86: Check data address for IBS software filter Follow that up with a more complete fix. 
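For readers navigating the diff below: for an op event opened with swfilt=1, the two mitigations reduce to roughly the following. This is a condensed paraphrase of the perf_ibs_swfilt_discard() and perf_ibs_phyaddr_clear() helpers added by this patch, with the PERF_SAMPLE_ADDR/RAW gating on each address check elided:

  /* 1) Discard the whole sample when any recoverable address is kernel. */
  drop = perf_exclude_event(event, regs) ||                  /* kernel RIP    */
         (event->attr.exclude_kernel && perf_ibs == &perf_ibs_op &&
          ((op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr)) ||
           (br_target_idx != -1 && op_data.op_brn_ret &&
            kernel_ip(br_target))));

  /* 2) For PERF_SAMPLE_RAW requested by a user who is not allowed
   *    kernel-level access (perf_allow_kernel() fails), clear the
   *    phys-addr-valid bit and zero the physical address register in
   *    the raw dump so no physical addresses are exposed.
   */

A dropped sample is still passed to perf_event_account_interrupt() so that throttling keeps working.
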
Fixes: d29e744c7167 ("perf/x86: Relax privilege filter restriction on AMD IBS") Suggested-by: Matteo Rizzo Co-developed-by: Ravi Bangoria Signed-off-by: Namhyung Kim Signed-off-by: Ravi Bangoria Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250321161251.1033-1-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul --- arch/x86/events/amd/ibs.c | 84 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index aa12f2c10ba6..6269f551abbe 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1054,6 +1054,8 @@ static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, data_src->mem_lock = PERF_MEM_LOCK_LOCKED; } +/* Be careful. Works only for contiguous MSRs. */ +#define ibs_fetch_msr_idx(msr) (msr - MSR_AMD64_IBSFETCHCTL) #define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, @@ -1159,6 +1161,67 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, return 1; } +static bool perf_ibs_is_kernel_data_addr(struct perf_event *event, + struct perf_ibs_data *ibs_data) +{ + u64 sample_type_mask = PERF_SAMPLE_ADDR | PERF_SAMPLE_RAW; + union ibs_op_data3 op_data3; + u64 dc_lin_addr; + + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + dc_lin_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + + return unlikely((event->attr.sample_type & sample_type_mask) && + op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr)); +} + +static bool perf_ibs_is_kernel_br_target(struct perf_event *event, + struct perf_ibs_data *ibs_data, + int br_target_idx) +{ + union ibs_op_data op_data; + u64 br_target; + + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + br_target = ibs_data->regs[br_target_idx]; + + return unlikely((event->attr.sample_type & PERF_SAMPLE_RAW) && + op_data.op_brn_ret && kernel_ip(br_target)); +} + +static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event *event, + struct pt_regs *regs, struct perf_ibs_data *ibs_data, + int br_target_idx) +{ + if (perf_exclude_event(event, regs)) + return true; + + if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel) + return false; + + if (perf_ibs_is_kernel_data_addr(event, ibs_data)) + return true; + + if (br_target_idx != -1 && + perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx)) + return true; + + return false; +} + +static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs, + struct perf_ibs_data *ibs_data) +{ + if (perf_ibs == &perf_ibs_op) { + ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)] &= ~(1ULL << 18); + ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)] = 0; + return; + } + + ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHCTL)] &= ~(1ULL << 52); + ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHPHYSAD)] = 0; +} + static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); @@ -1171,6 +1234,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; u64 *buf, *config, period, new_config = 0; + int br_target_idx = -1; if (!test_bit(IBS_STARTED, pcpu->state)) { fail: @@ -1241,6 +1305,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (perf_ibs == &perf_ibs_op) { if (ibs_caps & 
IBS_CAPS_BRNTRGT) { rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + br_target_idx = size; size++; } if (ibs_caps & IBS_CAPS_OPDATA4) { @@ -1267,16 +1332,20 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs.flags |= PERF_EFLAGS_EXACT; } - if (perf_ibs == &perf_ibs_op) - perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); - if ((event->attr.config2 & IBS_SW_FILTER_MASK) && - (perf_exclude_event(event, ®s) || - ((data.sample_flags & PERF_SAMPLE_ADDR) && - event->attr.exclude_kernel && kernel_ip(data.addr)))) { + perf_ibs_swfilt_discard(perf_ibs, event, ®s, &ibs_data, br_target_idx)) { throttle = perf_event_account_interrupt(event); goto out; } + /* + * Prevent leaking physical addresses to unprivileged users. Skip + * PERF_SAMPLE_PHYS_ADDR check since generic code prevents it for + * unprivileged users. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_allow_kernel(&event->attr)) { + perf_ibs_phyaddr_clear(perf_ibs, &ibs_data); + } if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw = (struct perf_raw_record){ @@ -1288,6 +1357,9 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_sample_save_raw_data(&data, event, &raw); } + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + /* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs. Thus we need to use rip from -- Gitee From 429c63f0f76f1a7ad28fd38851ee13b88b419cd5 Mon Sep 17 00:00:00 2001 From: PvsNarasimha Date: Thu, 25 Sep 2025 18:20:46 +0800 Subject: [PATCH 40/40] config: enable CONFIG_X86_BUS_LOCK_DETECT by default on x86 amd inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ -------------------------------- This commit updates the openeuler_defconfig file to include necessary configuration options that were missing, causing CI test failures. The modifications ensure compatibility with the AMD Genoa and Turin platforms, enabling the features required for proper functionality. Key changes: * Added CONFIG_X86_BUS_LOCK_DETECT This configuration Enables managed host access to the bus lock detect. These changes resolve the CI test failures and ensure that the kernel is properly configured for the AMD Genoa and Turin platforms. Signed-off-by: PvsNarasimha --- arch/x86/configs/openeuler_defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 16b5e86ba6fb..6b9268aa355c 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -540,6 +540,8 @@ CONFIG_LIVEPATCH_RESTRICT_KPROBE=y CONFIG_LIVEPATCH_ISOLATE_KPROBE=y CONFIG_LIVEPATCH_LONG_SYMBOL_SUPPORT=y # end of Enable Livepatch + +CONFIG_X86_BUS_LOCK_DETECT=y # end of Processor type and features CONFIG_FUNCTION_PADDING_CFI=11 -- Gitee
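
As a closing illustration of the user-visible effect of the IBS changes in this series, the sketch below opens a per-thread ibs_op event with the new software filter from an unprivileged process. It is a minimal illustrative example, not part of the patches above: the sysfs path follows the usual event_source convention, and the sampling parameters are arbitrary.

  /* Open ibs_op with swfilt=1 (config2 bit 0, see the format attribute
   * added in PATCH 37/40) and exclude_kernel=1, so the PMI handler drops
   * any kernel-tainted samples in software.
   */
  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          struct perf_event_attr attr;
          unsigned int type;
          FILE *f;
          int fd;

          /* The PMU type id is exported via sysfs. */
          f = fopen("/sys/bus/event_source/devices/ibs_op/type", "r");
          if (!f) {
                  fprintf(stderr, "ibs_op PMU not available\n");
                  return 1;
          }
          if (fscanf(f, "%u", &type) != 1) {
                  fclose(f);
                  return 1;
          }
          fclose(f);

          memset(&attr, 0, sizeof(attr));
          attr.size = sizeof(attr);
          attr.type = type;
          attr.config2 = 1;               /* swfilt=1: software privilege filter */
          attr.exclude_kernel = 1;        /* request user-only samples */
          attr.sample_type = PERF_SAMPLE_IP;
          attr.sample_period = 100000;    /* arbitrary example period */

          fd = syscall(__NR_perf_event_open, &attr, 0 /* self */,
                       -1 /* any CPU */, -1 /* no group */, 0);
          if (fd < 0) {
                  perror("perf_event_open");
                  return 1;
          }
          /* ... mmap the ring buffer and consume samples as with any PMU ... */
          close(fd);
          return 0;
  }

The equivalent from the command line is the example already quoted in PATCH 37/40: perf record -e ibs_op/swfilt=1/u.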