From 75d0f65e77fde4ad0dd87adea1e3b89cf938df0d Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 10 Dec 2020 11:09:36 -0600 Subject: [PATCH 01/24] x86/cpu: Add VM page flush MSR availablility as a CPUID feature mainline inclusion from mainline-v5.11 commit 69372cf01290b9587d2cee8fbe161d75d55c3adc category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- On systems that do not have hardware enforced cache coherency between encrypted and unencrypted mappings of the same physical page, the hypervisor can use the VM page flush MSR (0xc001011e) to flush the cache contents of an SEV guest page. When a small number of pages are being flushed, this can be used in place of issuing a WBINVD across all CPUs. CPUID 0x8000001f_eax[2] is used to determine if the VM page flush MSR is available. Add a CPUID feature to indicate it is supported and define the MSR. Signed-off-by: Tom Lendacky Message-Id: Signed-off-by: Paolo Bonzini Signed-off-by: Xie Haocheng --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8e93fde24a7a..8b94c5ca8a28 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -237,6 +237,7 @@ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c4918ef0c916..58c10665d431 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -498,6 +498,7 @@ #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ #define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b +#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 839b54a08e09..fea0df867d18 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -46,6 +46,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, + { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; -- Gitee From 935483c5fafbccfda1814e412ac13c4a93e47920 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 22 Jan 2021 12:40:46 -0800 Subject: [PATCH 02/24] x86/cpufeatures: Assign dedicated feature word for CPUID_0x8000001F[EAX] mainline inclusion from mainline-v5.12 commit fb35d30fe5b06cc24444f0405da8fbe0be5330d1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Collect the scattered SME/SEV related feature flags into a dedicated word. There are now five recognized features in CPUID.0x8000001F.EAX, with at least one more on the horizon (SEV-SNP). Using a dedicated word allows KVM to use its automagic CPUID adjustment logic when reporting the set of supported features to userspace. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov Reviewed-by: Brijesh Singh Link: https://lkml.kernel.org/r/20210122204047.2860075-2-seanjc@google.com Signed-off-by: Xie Haocheng --- arch/x86/include/asm/cpufeature.h | 7 +++++-- arch/x86/include/asm/cpufeatures.h | 17 +++++++++++------ arch/x86/include/asm/disabled-features.h | 3 ++- arch/x86/include/asm/required-features.h | 3 ++- arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/cpu/scattered.c | 5 ----- tools/arch/x86/include/asm/disabled-features.h | 3 ++- tools/arch/x86/include/asm/required-features.h | 3 ++- 8 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 59bf91c57aa8..1728d4ce5730 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,6 +30,7 @@ enum cpuid_leafs CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, + CPUID_8000_001F_EAX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8b94c5ca8a28..3ae98156ef98 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ +/* FREE! ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +/* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ +/* FREE! ( 7*32+20) */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -236,8 +236,6 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -394,6 +392,13 @@ #define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ +#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 018654e65c60..8f28fafa98b3 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -88,6 +88,7 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 3ff0d48469f2..b2d504f11937 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc463b008752..f243f577a928 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -957,6 +957,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + init_scattered_cpuid_features(c); init_speculation_control(c); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index fea0df867d18..21d1f062895a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -42,11 +42,6 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, - { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, - { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, - { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 5861d34f9771..2216077676c8 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -85,6 +85,7 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index 3ff0d48469f2..b2d504f11937 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ -- Gitee From d2126e1004d15acc945a1305bf82eff7e4e98d04 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:05 -0700 Subject: [PATCH 03/24] perf/core: Add perf_clear_branch_entry_bitfields() helper mainline inclusion from mainline-v5.19 commit bfe4daf850f45d92dcd3da477f0b0456620294c3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Make it simpler to reset all the info fields on the perf_branch_entry by adding a helper inline function. The goal is to centralize the initialization to avoid missing a field in case more are added. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-2-eranian@google.com Signed-off-by: Xie Haocheng --- arch/x86/events/intel/lbr.c | 36 +++++++++++++++++------------------- include/linux/perf_event.h | 16 ++++++++++++++++ 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 1d517fd08542..a6876a7f6738 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -786,6 +786,7 @@ void intel_pmu_lbr_disable_all(void) void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; @@ -801,15 +802,11 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].in_tx = 0; - cpuc->lbr_entries[i].abort = 0; - cpuc->lbr_entries[i].cycles = 0; - cpuc->lbr_entries[i].type = 0; - cpuc->lbr_entries[i].reserved = 0; + perf_clear_branch_entry_bitfields(br); + + br->from = msr_lastbranch.from; + br->to = msr_lastbranch.to; + br++; } cpuc->lbr_stack.nr = i; cpuc->lbr_stack.hw_idx = tos; @@ -825,6 +822,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -896,15 +894,14 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (abort && x86_pmu.lbr_double_abort && out > 0) out--; - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].type = 0; - cpuc->lbr_entries[out].reserved = 0; + perf_clear_branch_entry_bitfields(br+out); + br[out].from = from; + br[out].to = to; + br[out].mispred = mis; + br[out].predicted = pred; + br[out].in_tx = in_tx; + br[out].abort = abort; + br[out].cycles = cycles; out++; } cpuc->lbr_stack.nr = out; @@ -966,6 +963,8 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, to = rdlbr_to(i, lbr); info = rdlbr_info(i, lbr); + perf_clear_branch_entry_bitfields(e); + e->from = from; e->to = to; e->mispred = get_lbr_mispred(info); @@ -974,7 +973,6 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); e->type = get_lbr_br_type(info); - e->reserved = 0; } cpuc->lbr_stack.nr = i; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fb2e6c0f1b22..077017273e97 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1062,6 +1062,22 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->txn = 0; } +/* + * Clear all bitfields in the perf_branch_entry. + * The to and from fields are not cleared because they are + * systematically modified by caller. + */ +static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) +{ + br->mispred = 0; + br->predicted = 0; + br->in_tx = 0; + br->abort = 0; + br->cycles = 0; + br->type = 0; + br->reserved = 0; +} + extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, -- Gitee From cc624af3b717f0f1b42641972dd0b695e1eb3ee9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:06 -0700 Subject: [PATCH 04/24] x86/cpufeatures: Add AMD Fam19h Branch Sampling feature mainline inclusion from mainline-v5.19 commit a77d41ac3a0f41c80120ec5b8b08ab284fec950a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Add a cpu feature for AMD Fam19h Branch Sampling feature as bit 31 of EBX on CPUID leaf function 0x80000008. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-3-eranian@google.com Signed-off-by: Xie Haocheng --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 3ae98156ef98..0a8016b8a402 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -312,6 +312,7 @@ #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -- Gitee From 53d570a9a915c0b0c7ecf22c496f6b2846cea87e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:07 -0700 Subject: [PATCH 05/24] perf/x86/amd: Add AMD Fam19h Branch Sampling support mainline inclusion from mainline-v5.19 commit ada543459cab7f653dcacdaba4011a8bb19c627c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Add support for the AMD Fam19h 16-deep branch sampling feature as described in the AMD PPR Fam19h Model 01h Revision B1. This is a model specific extension. It is not an architected AMD feature. The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR registers. There is no branch type filtering. All control flow changes are captured. BRS relies on specific programming of the core PMU of Fam19h. In particular, the following requirements must be met: - the sampling period be greater than 16 (BRS depth) - the sampling period must use a fixed and not frequency mode BRS interacts with the NMI interrupt as well. Because enabling BRS is expensive, it is only activated after P event occurrences, where P is the desired sampling period. At P occurrences of the event, the counter overflows, the CPU catches the interrupt, activates BRS for 16 branches until it saturates, and then delivers the NMI to the kernel. Between the overflow and the time BRS activates more branches may be executed skewing the period. All along, the sampling event keeps counting. The skid may be attenuated by reducing the sampling period by 16 (subsequent patch). BRS is integrated into perf_events seamlessly via the same PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry records in the sampling buffer. No prediction information is supported. The branches are stored in reverse order of execution. The most recent branch is the first entry in each record. No modification to the perf tool is necessary. BRS can be used with any sampling event. However, it is recommended to use the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS captures. $ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test $ perf report -D 56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0 ... branch stack: nr:16 ..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0 ..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0 ..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0 ..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0 ..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0 ..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0 ..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0 ..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0 ..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0 ..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0 ..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0 ..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0 ..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0 ..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0 ..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0 ..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0 ... thread: test:18230 ...... dso: test Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/Makefile | 2 +- arch/x86/events/amd/brs.c | 317 +++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 233 ++++++++++++++++++++++- arch/x86/events/core.c | 10 +- arch/x86/events/perf_event.h | 101 ++++++++-- arch/x86/include/asm/msr-index.h | 4 + 6 files changed, 645 insertions(+), 22 deletions(-) create mode 100644 arch/x86/events/amd/brs.c diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index fe8795a67385..a640f67da987 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o ifdef CONFIG_AMD_IOMMU diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000000..3c13c484c637 --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian + */ +#include +#include +#include + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!boot_cpu_has(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. + */ +int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + /* can only capture at all priv levels due to the way BRS works */ + if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) + return -EINVAL; + + return 0; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. + * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + rdmsrl(brs_from(brs_idx), from); + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 39eb276d0277..84d80214c17f 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -325,8 +325,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + static int amd_core_hw_config(struct perf_event *event) { + int ret = 0; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -343,7 +351,66 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - return 0; + /* + * if branch stack is requested + */ + if (has_branch_stack(event)) { + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + } + return ret; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -366,7 +433,7 @@ static int amd_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -555,6 +622,8 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_brs_reset(); } static void amd_pmu_cpu_dead(int cpu) @@ -610,6 +679,8 @@ static void amd_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; + amd_brs_disable_all(); + x86_pmu_disable_all(); /* @@ -634,6 +705,30 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + hwc = &cpuc->events[idx]->hw; + + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -651,6 +746,18 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_add(event); +} + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_del(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -671,11 +778,31 @@ static void amd_pmu_disable_event(struct perf_event *event) */ static int amd_pmu_handle_irq(struct pt_regs *regs) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int handled; + int pmu_enabled; + + /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS is in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); /* Process any counter overflows */ handled = x86_pmu_handle_irq(regs); + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + /* * If a counter was handled, record a timestamp such that un-handled * NMIs will be claimed if arriving within that window. @@ -897,6 +1024,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with an inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by chosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -905,12 +1077,19 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (sched_in && x86_pmu.lbr_nr) + amd_pmu_brs_sched_task(ctx, sched_in); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -920,6 +1099,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -938,6 +1119,37 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_brs_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_brs = { + .name = "caps", + .attrs = amd_pmu_brs_attrs, + .is_visible = amd_brs_is_visible, +}; + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_brs, + NULL, +}; + static int __init amd_core_pmu_init(void) { u64 even_ctr_mask = 0ULL; @@ -989,6 +1201,19 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* + * BRS requires special event constraints and flushing on ctxsw. + */ + if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_sched_task; + /* + * put_event_constraints callback same as Fam17h, set above + */ + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e0f8b219046e..14f545fdfc6b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1296,6 +1296,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1659,11 +1663,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 99436e913d19..f3bae2998632 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -66,22 +66,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x00080 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -323,6 +324,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ @@ -986,6 +989,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -1081,6 +1089,50 @@ static inline bool fixed_counter_disabled(int i) #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_disable_all(void); +int amd_brs_setup_filter(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); + +/* + * check if BRS is activated on the CPU + * active defined as it has non-zero users and DBG_EXT_CFG.BRSEN=1 + */ +static inline bool amd_brs_active(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return cpuc->brs_active; +} #else /* CONFIG_CPU_SUP_AMD */ @@ -1089,6 +1141,23 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} + #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 58c10665d431..42b437c6b2b9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -674,6 +674,10 @@ #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff +/* AMD Branch Sampling configuration */ +#define MSR_AMD_DBG_EXTN_CFG 0xc000010f +#define MSR_AMD_SAMP_BR_FROM 0xc0010300 + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 -- Gitee From 6c5806e047c0e6043bc41511bf6141b9d532e01c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:08 -0700 Subject: [PATCH 06/24] perf/x86/amd: Add branch-brs helper event for Fam19h BRS mainline inclusion from mainline-v5.19 commit 44175993efbae04e8b2d7f7795ff512c3a726db0 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Add a pseudo event called branch-brs to help use the FAM Fam19h Branch Sampling feature (BRS). BRS samples taken branches, so it is best used when sampling on a retired taken branch event (0xc4) which is what BRS captures. Instead of trying to remember the event code or actual event name, users can simply do: $ perf record -b -e cpu/branch-brs/ -c 1000037 ..... Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-5-eranian@google.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 84d80214c17f..c0349766b092 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1145,8 +1145,23 @@ static struct attribute_group group_caps_amd_brs = { .is_visible = amd_brs_is_visible, }; +EVENT_ATTR_STR(branch-brs, amd_branch_brs, + "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); + +static struct attribute *amd_brs_events_attrs[] = { + EVENT_PTR(amd_branch_brs), + NULL, +}; + +static struct attribute_group group_events_amd_brs = { + .name = "events", + .attrs = amd_brs_events_attrs, + .is_visible = amd_brs_is_visible, +}; + static const struct attribute_group *amd_attr_update[] = { &group_caps_amd_brs, + &group_events_amd_brs, NULL, }; -- Gitee From fbeacba301033893999d486eef91d22395810ae7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:09 -0700 Subject: [PATCH 07/24] perf/x86/amd: Enable branch sampling priv level filtering mainline inclusion from mainline-v5.19 commit 8910075d61a37e5b0d82e6c83ed9a0a31fe9ea08 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- The AMD Branch Sampling features does not provide hardware filtering by privilege level. The associated PMU counter does but not the branch sampling by itself. Given how BRS operates there is a possibility that BRS captures kernel level branches even though the event is programmed to count only at the user level. Implement a workaround in software by removing the branches which belong to the wrong privilege level. The privilege level is evaluated on the target of the branch and not the source so as to be compatible with other architectures. As a consequence of this patch, the number of entries in the PERF_RECORD_BRANCH_STACK buffer may be less than the maximum (16). It could even be zero. Another consequence is that consecutive entries in the branch stack may not reflect actual code path and may have discontinuities, in case kernel branches were suppressed. But this is no different than what happens on other architectures. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-6-eranian@google.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/brs.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 3c13c484c637..40461c3ce714 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -92,10 +92,6 @@ int amd_brs_setup_filter(struct perf_event *event) if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) return -EINVAL; - /* can only capture at all priv levels due to the way BRS works */ - if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) - return -EINVAL; - return 0; } @@ -195,6 +191,21 @@ void amd_brs_disable_all(void) amd_brs_disable(); } +static bool amd_brs_match_plm(struct perf_event *event, u64 to) +{ + int type = event->attr.branch_sample_type; + int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV; + int plm_u = PERF_SAMPLE_BRANCH_USER; + + if (!(type & plm_k) && kernel_ip(to)) + return 0; + + if (!(type & plm_u) && !kernel_ip(to)) + return 0; + + return 1; +} + /* * Caller must ensure amd_brs_inuse() is true before calling * return: @@ -252,8 +263,6 @@ void amd_brs_drain(void) if (to == BRS_POISON) break; - rdmsrl(brs_from(brs_idx), from); - /* * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. * Necessary to generate proper virtual addresses suitable for @@ -261,6 +270,11 @@ void amd_brs_drain(void) */ to = (u64)(((s64)to << shift) >> shift); + if (!amd_brs_match_plm(event, to)) + continue; + + rdmsrl(brs_from(brs_idx), from); + perf_clear_branch_entry_bitfields(br+nr); br[nr].from = from; -- Gitee From a6e05fc17ad3c4467b92a379aa49a06402edab54 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:10 -0700 Subject: [PATCH 08/24] perf/x86/amd: Add AMD branch sampling period adjustment mainline inclusion from mainline-v5.19 commit ba2fe7500845a30fc845a72081999cf632051862 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Add code to adjust the sampling event period when used with the Branch Sampling feature (BRS). Given the depth of the BRS (16), the period is reduced by that depth such that in the best case scenario, BRS saturates at the desired sampling period. In practice, though, the processor may execute more branches. Given a desired period P and a depth D, the kernel programs the actual period at P - D. After P occurrences of the sampling event, the counter overflows. It then may take X branches (skid) before the NMI is caught and held by the hardware and BRS activates. Then, after D branches, BRS saturates and the NMI is delivered. With no skid, the effective period would be (P - D) + D = P. In practice, however, it will likely be (P - D) + X + D. There is no way to eliminate X or predict X. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-7-eranian@google.com Signed-off-by: Xie Haocheng --- arch/x86/events/core.c | 7 +++++++ arch/x86/events/perf_event.h | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 14f545fdfc6b..648a92e6d013 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1332,6 +1332,13 @@ int x86_perf_event_set_period(struct perf_event *event) x86_pmu.set_topdown_event_period) return x86_pmu.set_topdown_event_period(event); + /* + * decrease period by the depth of the BRS feature to get + * the last N taken branches and approximate the desired period + */ + if (has_branch_stack(event)) + period = amd_brs_adjust_period(period); + /* * If we are way outside a reasonable range then just skip forward: */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f3bae2998632..fc06b2332d6d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1134,6 +1134,14 @@ static inline bool amd_brs_active(void) return cpuc->brs_active; } +static inline s64 amd_brs_adjust_period(s64 period) +{ + if (period > x86_pmu.lbr_nr) + return period - x86_pmu.lbr_nr; + + return period; +} + #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) @@ -1158,6 +1166,10 @@ static inline void amd_brs_disable_all(void) { } +static inline s64 amd_brs_adjust_period(s64 period) +{ + return period; +} #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) -- Gitee From abf604e4c99836b75cfbd956c54dbb01d492aa04 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:11 -0700 Subject: [PATCH 09/24] perf/x86/amd: Make Zen3 branch sampling opt-in mainline inclusion from mainline-v5.19 commit cc37e520a236069c0de0e7ea455082fa11c73b12 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Add a kernel config option CONFIG_PERF_EVENTS_AMD_BRS to make the support for AMD Zen3 Branch Sampling (BRS) an opt-in compile time option. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-8-eranian@google.com Signed-off-by: Xie Haocheng --- arch/x86/events/Kconfig | 7 ++++++ arch/x86/events/amd/Makefile | 3 ++- arch/x86/events/perf_event.h | 49 ++++++++++++++++++++++++++++-------- 3 files changed, 48 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 39d9ded9e25a..b8c488eec07e 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -34,4 +34,11 @@ config PERF_EVENTS_AMD_POWER (CPUID Fn8000_0007_EDX[12]) interface to calculate the average power consumption on Family 15h processors. +config PERF_EVENTS_AMD_BRS + depends on PERF_EVENTS && CPU_SUP_AMD + bool "AMD Zen3 Branch Sampling support" + help + Enable AMD Zen3 branch sampling support (BRS) which samples up to + 16 consecutive taken branches in registers. + endmenu diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index a640f67da987..40c242ac9d0e 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o brs.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o ifdef CONFIG_AMD_IOMMU diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index fc06b2332d6d..3927f9549017 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1089,6 +1089,8 @@ static inline bool fixed_counter_disabled(int i) #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); + +#ifdef CONFIG_PERF_EVENTS_AMD_BRS int amd_brs_init(void); void amd_brs_disable(void); void amd_brs_enable(void); @@ -1123,25 +1125,52 @@ static inline void amd_pmu_brs_del(struct perf_event *event) void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); -/* - * check if BRS is activated on the CPU - * active defined as it has non-zero users and DBG_EXT_CFG.BRSEN=1 - */ -static inline bool amd_brs_active(void) +static inline s64 amd_brs_adjust_period(s64 period) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (period > x86_pmu.lbr_nr) + return period - x86_pmu.lbr_nr; - return cpuc->brs_active; + return period; +} +#else +static inline int amd_brs_init(void) +{ + return 0; } +static inline void amd_brs_disable(void) {} +static inline void amd_brs_enable(void) {} +static inline void amd_brs_drain(void) {} +static inline void amd_brs_lopwr_init(void) {} +static inline void amd_brs_disable_all(void) {} +static inline int amd_brs_setup_filter(struct perf_event *event) +{ + return 0; +} +static inline void amd_brs_reset(void) {} -static inline s64 amd_brs_adjust_period(s64 period) +static inline void amd_pmu_brs_add(struct perf_event *event) { - if (period > x86_pmu.lbr_nr) - return period - x86_pmu.lbr_nr; +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ +} +static inline s64 amd_brs_adjust_period(s64 period) +{ return period; } +static inline void amd_brs_enable_all(void) +{ +} + +#endif + #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) -- Gitee From 5d94444412e6b8646edaef3aa115ade7f24ca019 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:12 -0700 Subject: [PATCH 10/24] ACPI: Add perf low power callback mainline inclusion from mainline-v5.19 commit 2a606a18cd672a16343d146a126721b34cc6adbd category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Add an optional callback needed by some PMU features, e.g., AMD BRS, to give a chance to the perf_events code to change its state before a CPU goes to low power and after it comes back. The callback is void when the PERF_NEEDS_LOPWR_CB flag is not set. This flag must be set in arch specific perf_event.h header whenever needed. When not set, there is no impact on the ACPI code. Signed-off-by: Stephane Eranian [peterz: build fix] Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-9-eranian@google.com Signed-off-by: Xie Haocheng --- drivers/acpi/acpi_pad.c | 7 +++++++ drivers/acpi/processor_idle.c | 5 +++++ include/linux/perf_event.h | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index b84ab722feb4..4472f7355abf 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -163,6 +164,9 @@ static int power_saving_thread(void *data) tsc_marked_unstable = 1; } local_irq_disable(); + + perf_lopwr_cb(true); + tick_broadcast_enable(); tick_broadcast_enter(); stop_critical_timings(); @@ -171,6 +175,9 @@ static int power_saving_thread(void *data) start_critical_timings(); tick_broadcast_exit(); + + perf_lopwr_cb(false); + local_irq_enable(); if (time_before(expire_time, jiffies)) { diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 9921b481c7ee..d07d46b07f91 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -20,6 +20,7 @@ #include #include #include +#include #include /* @@ -551,6 +552,8 @@ static void wait_for_freeze(void) */ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) { + perf_lopwr_cb(true); + if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -561,6 +564,8 @@ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) inb(cx->address); wait_for_freeze(); } + + perf_lopwr_cb(false); } /** diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 077017273e97..90ed4b08fae8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1624,4 +1624,11 @@ extern void __weak arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); + +#ifndef PERF_NEEDS_LOPWR_CB +static inline void perf_lopwr_cb(bool mode) +{ +} +#endif + #endif /* _LINUX_PERF_EVENT_H */ -- Gitee From 63f9030a34e58b479bf1fc290d89de772356f28f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:13 -0700 Subject: [PATCH 11/24] perf/x86/amd: Add idle hooks for branch sampling mainline inclusion from mainline-v5.19 commit d5616bac7adadbf42a3b63b8717e75eb82a2cc2c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- On AMD Fam19h Zen3, the branch sampling (BRS) feature must be disabled before entering low power and re-enabled (if was active) when returning from low power. Otherwise, the NMI interrupt may be held up for too long and cause problems. Stopping BRS will cause the NMI to be delivered if it was held up. Define a perf_amd_brs_lopwr_cb() callback to stop/restart BRS. The callback is protected by a jump label which is enabled only when AMD BRS is detected. In all other cases, the callback is never called. Signed-off-by: Stephane Eranian [peterz: static_call() and build fixes] Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-10-eranian@google.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/brs.c | 33 +++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 4 ++++ arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 23 +++++++++++++++++++++ 4 files changed, 61 insertions(+) diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 40461c3ce714..895c82165d85 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -7,6 +7,7 @@ * Contributed by Stephane Eranian */ #include +#include #include #include @@ -329,3 +330,35 @@ void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) if (sched_in) amd_brs_poison_buffer(); } + +/* + * called from ACPI processor_idle.c or acpi_pad.c + * with interrupts disabled + */ +void perf_amd_brs_lopwr_cb(bool lopwr_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* + * on mwait in, we may end up in non C0 state. + * we must disable branch sampling to avoid holding the NMI + * for too long. We disable it in hardware but we + * keep the state in cpuc, so we can re-enable. + * + * The hardware will deliver the NMI if needed when brsmen cleared + */ + if (cpuc->brs_active) { + cfg.val = get_debug_extn_cfg(); + cfg.brsmen = !lopwr_in; + set_debug_extn_cfg(cfg.val); + } +} + +DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb); + +void __init amd_brs_lopwr_init(void) +{ + static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index c0349766b092..db7b9dcf4be4 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -1225,6 +1226,9 @@ static int __init amd_core_pmu_init(void) /* * put_event_constraints callback same as Fam17h, set above */ + + /* branch sampling must be stopped when entering low power */ + amd_brs_lopwr_init(); } x86_pmu.attr_update = amd_attr_update; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3927f9549017..347de3d3df8b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1097,6 +1097,7 @@ void amd_brs_enable(void); void amd_brs_enable_all(void); void amd_brs_disable_all(void); void amd_brs_drain(void); +void amd_brs_lopwr_init(void); void amd_brs_disable_all(void); int amd_brs_setup_filter(struct perf_event *event); void amd_brs_reset(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index d3530eac30bc..8d1e2a69b212 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_PERF_EVENT_H #define _ASM_X86_PERF_EVENT_H +#include + /* * Performance event hw details: */ @@ -516,6 +518,27 @@ static inline void intel_pt_handle_vmx(int on) #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) extern void amd_pmu_enable_virt(void); extern void amd_pmu_disable_virt(void); + +#if defined(CONFIG_PERF_EVENTS_AMD_BRS) + +#define PERF_NEEDS_LOPWR_CB 1 + +/* + * architectural low power callback impacts + * drivers/acpi/processor_idle.c + * drivers/acpi/acpi_pad.c + */ +extern void perf_amd_brs_lopwr_cb(bool lopwr_in); + +DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); + +static inline void perf_lopwr_cb(bool lopwr_in) +{ + static_call_mod(perf_lopwr_cb)(lopwr_in); +} + +#endif /* PERF_NEEDS_LOPWR_CB */ + #else static inline void amd_pmu_enable_virt(void) { } static inline void amd_pmu_disable_virt(void) { } -- Gitee From 00552473fa31fb90d887cdce947f83ab8b63c11f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:53 +0530 Subject: [PATCH 12/24] x86/cpufeatures: Add PerfMonV2 feature bit mainline inclusion from mainline-v5.19 commit d6d0c7f681fda1d07e005c8f653e578b77a0eb40 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 0 of EAX indicates support for Performance Monitoring Version 2 (PerfMonV2) features. If found to be set during PMU initialization, the EBX bits of the same CPUID function can be used to determine the number of available PMCs for different PMU types. Additionally, Core PMCs can be managed using new global control and status registers. For better utilization of feature words, PerfMonV2 is added as a scattered feature bit. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/c70e497e22f18e7f05b025bb64ca21cc12b17792.1650515382.git.sandipan.das@amd.com Signed-off-by: Xie Haocheng --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0a8016b8a402..76e0e04aa07d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -/* FREE! ( 7*32+20) */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 21d1f062895a..8c0d9b5426f7 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -42,6 +42,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; -- Gitee From d85a90a8def844636a29aee013a12e645a8a8ee4 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:54 +0530 Subject: [PATCH 13/24] x86/msr: Add PerfCntrGlobal* registers mainline inclusion from mainline-v5.19 commit 089be16d5992dd0bc6df15ef12042fd1023ded9a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Add MSR definitions that will be used to enable the new AMD Performance Monitoring Version 2 (PerfMonV2) features. These include: * Performance Counter Global Control (PerfCntrGlobalCtl) * Performance Counter Global Status (PerfCntrGlobalStatus) * Performance Counter Global Status Clear (PerfCntrGlobalStatusClr) The new Performance Counter Global Control and Status MSRs provide an interface for enabling or disabling multiple counters at the same time and for testing overflow without probing the individual registers for each PMC. The availability of these registers is indicated through the PerfMonV2 feature bit of CPUID leaf 0x80000022 EAX. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/cdc0d8f75bd519848731b5c64d924f5a0619a573.1650515382.git.sandipan.das@amd.com Signed-off-by: Xie Haocheng --- arch/x86/include/asm/msr-index.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 42b437c6b2b9..a709a063af88 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -508,6 +508,11 @@ #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +/* AMD Performance Counter Global Status and Control MSRs */ +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 +#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -- Gitee From 8481998d2644e4c3e55e1a7ff333a66bd46fd8c1 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:55 +0530 Subject: [PATCH 14/24] perf/x86/amd/core: Detect PerfMonV2 support mainline inclusion from mainline-v5.19 commit 21d59e3e2c403c83ba196a5857d517054124168e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- AMD Performance Monitoring Version 2 (PerfMonV2) introduces some new Core PMU features such as detection of the number of available PMCs and managing PMCs using global registers namely, PerfCntrGlobalCtl and PerfCntrGlobalStatus. Clearing PerfCntrGlobalCtl and PerfCntrGlobalStatus ensures that all PMCs are inactive and have no pending overflows when CPUs are onlined or offlined. The PMU version (x86_pmu.version) now indicates PerfMonV2 support and will be used to bypass the new features on unsupported processors. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/dc8672ecbddff394e088ca8abf94b089b8ecc2e7.1650515382.git.sandipan.das@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/core.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index db7b9dcf4be4..288f8d29bea1 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -19,6 +19,9 @@ static unsigned long perf_nmi_window; #define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) #define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -578,6 +581,18 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); +} + static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -625,6 +640,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->refcnt++; amd_brs_reset(); + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -644,6 +660,8 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); } /* @@ -1185,6 +1203,15 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + } + /* * AMD Core perfctr has separate MSRs for the NB events, see * the amd/uncore.c driver. -- Gitee From 7d2a55464e55b13d608aa169fd2310940c1bb37f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:56 +0530 Subject: [PATCH 15/24] perf/x86/amd/core: Detect available counters mainline inclusion from mainline-v5.19 commit 56e026a7ca3f92b8e44359e1f705febd1833f701 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use CPUID leaf 0x80000022 EBX to detect the number of Core PMCs. This offers more flexibility if the counts change in later processor families. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/68a6d9688df189267db26530378870edd34f7b06.1650515382.git.sandipan.das@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/core.c | 6 ++++++ arch/x86/include/asm/perf_event.h | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 288f8d29bea1..a52d07318654 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1186,6 +1186,7 @@ static const struct attribute_group *amd_attr_update[] = { static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -1206,9 +1207,14 @@ static int __init amd_core_pmu_init(void) /* Check for Performance Monitoring v2 support */ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + /* Update PMU version for later usage */ x86_pmu.version = 2; + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; } diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8d1e2a69b212..513b66da416b 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -186,6 +186,18 @@ union cpuid28_ecx { unsigned int full; }; +/* + * AMD "Extended Performance Monitoring and Debug" CPUID + * detection/enumeration details: + */ +union cpuid_0x80000022_ebx { + struct { + /* Number of Core Performance Counters */ + unsigned int num_core_pmc:4; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -367,6 +379,11 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +/* + * AMD Extended Performance Monitoring and Debug cpuid feature detection + */ +#define EXT_PERFMON_DEBUG_FEATURES 0x80000022 + /* * IBS cpuid feature detection */ -- Gitee From d64e63c2559f56901c0607f385ca82de53ed8ff7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:57 +0530 Subject: [PATCH 16/24] perf/x86/amd/core: Add PerfMonV2 counter control mainline inclusion from mainline-v5.19 commit 9622e67e3980c01872490de0925e5c6c23247c94 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use a new scheme to manage the Core PMCs using the new global control and status registers. This will be bypassed on unsupported hardware (x86_pmu.version < 2). Currently, all PMCs have dedicated control (PERF_CTL) and counter (PERF_CTR) registers. For a given PMC, the enable (En) bit of its PERF_CTL register is used to start or stop counting. The Performance Counter Global Control (PerfCntrGlobalCtl) register has enable (PerfCntrEn) bits for each PMC. For a PMC to start counting, both PERF_CTL and PerfCntrGlobalCtl enable bits must be set. If either of those are cleared, the PMC stops counting. In x86_pmu_{en,dis}able_all(), the PERF_CTL registers of all active PMCs are written to in a loop. Ideally, PMCs counting the same event that were started and stopped at the same time should record the same counts. Due to delays in between writes to the PERF_CTL registers across loop iterations, the PMCs cannot be enabled or disabled at the same instant and hence, record slightly different counts. This is fixed by enabling or disabling all active PMCs at the same time with a single write to the PerfCntrGlobalCtl register. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/dfe8e934074aaabc6ba748dfaccd0a77c974bb82.1650515382.git.sandipan.das@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/core.c | 50 ++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index a52d07318654..d71bc52fb949 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -664,6 +664,11 @@ static void amd_pmu_cpu_dead(int cpu) amd_pmu_cpu_reset(cpu); } +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); +} + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -693,15 +698,11 @@ static void amd_pmu_wait_on_overflow(int idx) } } -static void amd_pmu_disable_all(void) +static void amd_pmu_check_overflow(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - amd_brs_disable_all(); - - x86_pmu_disable_all(); - /* * This shouldn't be called from NMI context, but add a safeguard here * to return, since if we're in NMI context we can't wait for an NMI @@ -748,6 +749,26 @@ static void amd_pmu_enable_all(int added) } } +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. + */ + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -765,6 +786,20 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static void amd_pmu_v2_disable_all(void) +{ + /* Disable all PMCs */ + amd_pmu_set_global_ctl(0); + amd_pmu_check_overflow(); +} + static void amd_pmu_add_event(struct perf_event *event) { if (needs_branch_stack(event)) @@ -1216,6 +1251,11 @@ static int __init amd_core_pmu_init(void) x86_pmu.num_counters = ebx.split.num_core_pmc; amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; } /* -- Gitee From ed1e688e67894e606cfdea14a2e150e89e68f411 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:58 +0530 Subject: [PATCH 17/24] perf/x86/amd/core: Add PerfMonV2 overflow handling mainline inclusion from mainline-v5.19 commit 7685665c390dc68c2d9a74e8445f41494cc8f6cf category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use a new scheme to process Core PMC overflows in the NMI handler using the new global control and status registers. This will be bypassed on unsupported hardware (x86_pmu.version < 2). In x86_pmu_handle_irq(), overflows are detected by testing the contents of the PERF_CTR register for each active PMC in a loop. The new scheme instead inspects the overflow bits of the global status register. The Performance Counter Global Status (PerfCntrGlobalStatus) register has overflow (PerfCntrOvfl) bits for each PMC. This is, however, a read-only MSR. To acknowledge that overflows have been processed, the NMI handler must clear the bits by writing to the PerfCntrGlobalStatusClr register. In x86_pmu_handle_irq(), PMCs counting the same event that are started and stopped at the same time record slightly different counts due to delays in between reads from the PERF_CTR registers. This is fixed by stopping and starting the PMCs at the same before and with a single write to the Performance Counter Global Control (PerfCntrGlobalCtl) upon entering and before exiting the NMI handler. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/f20b7e4da0b0a83bdbe05857f354146623bc63ab.1650515382.git.sandipan.das@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/core.c | 144 ++++++++++++++++++++++++++++++++++--- 1 file changed, 133 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d71bc52fb949..95b68711f287 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "../perf_event.h" @@ -669,6 +670,45 @@ static inline void amd_pmu_set_global_ctl(u64 ctl) wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); } +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status & amd_pmu_global_cntr_mask; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ + status &= amd_pmu_global_cntr_mask; + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); +} + +DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit); + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -681,7 +721,6 @@ static inline void amd_pmu_set_global_ctl(u64 ctl) static void amd_pmu_wait_on_overflow(int idx) { unsigned int i; - u64 counter; /* * Wait for the counter to be reset if it has overflowed. This loop @@ -689,8 +728,7 @@ static void amd_pmu_wait_on_overflow(int idx) * forever... */ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { - rdmsrl(x86_pmu_event_addr(idx), counter); - if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + if (!static_call(amd_pmu_test_overflow)(idx)) break; /* Might be in IRQ context, so can't sleep */ @@ -830,6 +868,24 @@ static void amd_pmu_del_event(struct perf_event *event) * handled a counter. When an un-handled NMI is received, it will be claimed * only if arriving within that window. */ +static inline int amd_pmu_adjust_nmi_window(int handled) +{ + /* + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. + */ + if (handled) { + this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + + return handled; + } + + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) + return NMI_DONE; + + return NMI_HANDLED; +} + static int amd_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -857,20 +913,84 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (pmu_enabled) amd_pmu_enable_all(0); + return amd_pmu_adjust_nmi_window(handled); +} + +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 status, mask; + bool pmu_enabled; + /* - * If a counter was handled, record a timestamp such that un-handled - * NMIs will be claimed if arriving within that window. + * Save the PMU state as it needs to be restored when leaving the + * handler */ - if (handled) { - this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; - return handled; + /* Stop counting */ + amd_pmu_v2_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + + status &= ~mask; } - if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) - return NMI_DONE; + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); - return NMI_HANDLED; + /* Clear overflow bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_v2_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); } static struct event_constraint * @@ -1256,6 +1376,8 @@ static int __init amd_core_pmu_init(void) x86_pmu.enable_all = amd_pmu_v2_enable_all; x86_pmu.disable_all = amd_pmu_v2_disable_all; x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; + static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status); } /* -- Gitee From 0638f35375a30640e0ae1565bd49ebf826757fe7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 18 May 2022 14:13:27 +0530 Subject: [PATCH 18/24] perf/x86/amd/core: Fix reloading events for SVM mainline inclusion from mainline-v5.19 commit bae19fdd7e9e759580ac4693d2df3bc23ab415d7 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Commit 1018faa6cf23 ("perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled") addresses an issue in which the Host-Only bit in the counter control registers needs to be masked off when SVM is not enabled. The events need to be reloaded whenever SVM is enabled or disabled for a CPU and this requires the PERF_CTL registers to be reprogrammed using {enable,disable}_all(). However, PerfMonV2 variants of these functions do not reprogram the PERF_CTL registers. Hence, the legacy enable_all() function should also be called. Fixes: 9622e67e3980 ("perf/x86/amd/core: Add PerfMonV2 counter control") Reported-by: Like Xu Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220518084327.464005-1-sandipan.das@amd.com --- arch/x86/events/amd/core.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 95b68711f287..c3d1d1c0799e 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1462,6 +1462,24 @@ __init int amd_pmu_init(void) return 0; } +static inline void amd_pmu_reload_virt(void) +{ + if (x86_pmu.version >= 2) { + /* + * Clear global enable bits, reprogram the PERF_CTL + * registers with updated perf_ctr_virt_mask and then + * set global enable bits once again + */ + amd_pmu_v2_disable_all(); + amd_pmu_enable_all(0); + amd_pmu_v2_enable_all(0); + return; + } + + amd_pmu_disable_all(); + amd_pmu_enable_all(0); +} + void amd_pmu_enable_virt(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1469,8 +1487,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -1487,7 +1504,6 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); -- Gitee From b13b37937cfa6df5c3517f886ca866f06162372b Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:10:48 -0500 Subject: [PATCH 19/24] perf/x86/amd/ibs: Add bitfield definitions in new header mainline inclusion from mainline-v5.15 commit 6a371bafe613b7746c3d3ac486bdb3035f77e029 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Add with bitfield definitions for IBS MSRs, and demonstrate usage within the driver. Also move 'struct perf_ibs_data' where it can be shared with the perf tool that will soon be using it. No functional changes. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210817221048.88063-9-kim.phillips@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/ibs.c | 23 +++--- arch/x86/include/asm/amd-ibs.h | 132 +++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 14 deletions(-) create mode 100644 arch/x86/include/asm/amd-ibs.h diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ccc9ee1971e8..9739019d4b67 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -26,6 +26,7 @@ static u32 ibs_caps; #include #include +#include #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT @@ -100,15 +101,6 @@ struct perf_ibs { u64 (*get_count)(u64 config); }; -struct perf_ibs_data { - u32 size; - union { - u32 data[0]; /* data buffer starts here */ - u32 caps; - }; - u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; -}; - static int perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) { @@ -329,11 +321,14 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, static u64 get_ibs_fetch_count(u64 config) { - return (config & IBS_FETCH_CNT) >> 12; + union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config; + + return fetch_ctl.fetch_cnt << 4; } static u64 get_ibs_op_count(u64 config) { + union ibs_op_ctl op_ctl = (union ibs_op_ctl)config; u64 count = 0; /* @@ -341,12 +336,12 @@ static u64 get_ibs_op_count(u64 config) * and the lower 7 bits of CurCnt are randomized. * Otherwise CurCnt has the full 27-bit current counter value. */ - if (config & IBS_OP_VAL) { - count = (config & IBS_OP_MAX_CNT) << 4; + if (op_ctl.op_val) { + count = op_ctl.opmaxcnt << 4; if (ibs_caps & IBS_CAPS_OPCNTEXT) - count += config & IBS_OP_MAX_CNT_EXT_MASK; + count += op_ctl.opmaxcnt_ext << 20; } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { - count = (config & IBS_OP_CUR_CNT) >> 32; + count = op_ctl.opcurcnt; } return count; diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h new file mode 100644 index 000000000000..46e1df45efc0 --- /dev/null +++ b/arch/x86/include/asm/amd-ibs.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * From PPR Vol 1 for AMD Family 19h Model 01h B1 + * 55898 Rev 0.35 - Feb 5, 2021 + */ + +#include + +/* + * IBS Hardware MSRs + */ + +/* MSR 0xc0011030: IBS Fetch Control */ +union ibs_fetch_ctl { + __u64 val; + struct { + __u64 fetch_maxcnt:16,/* 0-15: instruction fetch max. count */ + fetch_cnt:16, /* 16-31: instruction fetch count */ + fetch_lat:16, /* 32-47: instruction fetch latency */ + fetch_en:1, /* 48: instruction fetch enable */ + fetch_val:1, /* 49: instruction fetch valid */ + fetch_comp:1, /* 50: instruction fetch complete */ + ic_miss:1, /* 51: i-cache miss */ + phy_addr_valid:1,/* 52: physical address valid */ + l1tlb_pgsz:2, /* 53-54: i-cache L1TLB page size + * (needs IbsPhyAddrValid) */ + l1tlb_miss:1, /* 55: i-cache fetch missed in L1TLB */ + l2tlb_miss:1, /* 56: i-cache fetch missed in L2TLB */ + rand_en:1, /* 57: random tagging enable */ + fetch_l2_miss:1,/* 58: L2 miss for sampled fetch + * (needs IbsFetchComp) */ + reserved:5; /* 59-63: reserved */ + }; +}; + +/* MSR 0xc0011033: IBS Execution Control */ +union ibs_op_ctl { + __u64 val; + struct { + __u64 opmaxcnt:16, /* 0-15: periodic op max. count */ + reserved0:1, /* 16: reserved */ + op_en:1, /* 17: op sampling enable */ + op_val:1, /* 18: op sample valid */ + cnt_ctl:1, /* 19: periodic op counter control */ + opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ + reserved1:5, /* 27-31: reserved */ + opcurcnt:27, /* 32-58: periodic op counter current count */ + reserved2:5; /* 59-63: reserved */ + }; +}; + +/* MSR 0xc0011035: IBS Op Data 2 */ +union ibs_op_data { + __u64 val; + struct { + __u64 comp_to_ret_ctr:16, /* 0-15: op completion to retire count */ + tag_to_ret_ctr:16, /* 15-31: op tag to retire count */ + reserved1:2, /* 32-33: reserved */ + op_return:1, /* 34: return op */ + op_brn_taken:1, /* 35: taken branch op */ + op_brn_misp:1, /* 36: mispredicted branch op */ + op_brn_ret:1, /* 37: branch op retired */ + op_rip_invalid:1, /* 38: RIP is invalid */ + op_brn_fuse:1, /* 39: fused branch op */ + op_microcode:1, /* 40: microcode op */ + reserved2:23; /* 41-63: reserved */ + }; +}; + +/* MSR 0xc0011036: IBS Op Data 2 */ +union ibs_op_data2 { + __u64 val; + struct { + __u64 data_src:3, /* 0-2: data source */ + reserved0:1, /* 3: reserved */ + rmt_node:1, /* 4: destination node */ + cache_hit_st:1, /* 5: cache hit state */ + reserved1:57; /* 5-63: reserved */ + }; +}; + +/* MSR 0xc0011037: IBS Op Data 3 */ +union ibs_op_data3 { + __u64 val; + struct { + __u64 ld_op:1, /* 0: load op */ + st_op:1, /* 1: store op */ + dc_l1tlb_miss:1, /* 2: data cache L1TLB miss */ + dc_l2tlb_miss:1, /* 3: data cache L2TLB hit in 2M page */ + dc_l1tlb_hit_2m:1, /* 4: data cache L1TLB hit in 2M page */ + dc_l1tlb_hit_1g:1, /* 5: data cache L1TLB hit in 1G page */ + dc_l2tlb_hit_2m:1, /* 6: data cache L2TLB hit in 2M page */ + dc_miss:1, /* 7: data cache miss */ + dc_mis_acc:1, /* 8: misaligned access */ + reserved:4, /* 9-12: reserved */ + dc_wc_mem_acc:1, /* 13: write combining memory access */ + dc_uc_mem_acc:1, /* 14: uncacheable memory access */ + dc_locked_op:1, /* 15: locked operation */ + dc_miss_no_mab_alloc:1, /* 16: DC miss with no MAB allocated */ + dc_lin_addr_valid:1, /* 17: data cache linear address valid */ + dc_phy_addr_valid:1, /* 18: data cache physical address valid */ + dc_l2_tlb_hit_1g:1, /* 19: data cache L2 hit in 1GB page */ + l2_miss:1, /* 20: L2 cache miss */ + sw_pf:1, /* 21: software prefetch */ + op_mem_width:4, /* 22-25: load/store size in bytes */ + op_dc_miss_open_mem_reqs:6, /* 26-31: outstanding mem reqs on DC fill */ + dc_miss_lat:16, /* 32-47: data cache miss latency */ + tlb_refill_lat:16; /* 48-63: L1 TLB refill latency */ + }; +}; + +/* MSR 0xc001103c: IBS Fetch Control Extended */ +union ic_ibs_extd_ctl { + __u64 val; + struct { + __u64 itlb_refill_lat:16, /* 0-15: ITLB Refill latency for sampled fetch */ + reserved:48; /* 16-63: reserved */ + }; +}; + +/* + * IBS driver related + */ + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; -- Gitee From 04246811092085092282a06f9d727c8975dcfaf5 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 29 Apr 2022 10:44:41 +0530 Subject: [PATCH 20/24] perf/amd/ibs: Use interrupt regs ip for stack unwinding mainline inclusion from mainline-v5.15 commit 3d47083b9ff46863e8374ad3bb5edb5e464c75f8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- IbsOpRip is recorded when IBS interrupt is triggered. But there is a skid from the time IBS interrupt gets triggered to the time the interrupt is presented to the core. Meanwhile processor would have moved ahead and thus IbsOpRip will be inconsistent with rsp and rbp recorded as part of the interrupt regs. This causes issues while unwinding stack using the ORC unwinder as it needs consistent rip, rsp and rbp. Fix this by using rip from interrupt regs instead of IbsOpRip for stack unwinding. Fixes: ee9f8fce99640 ("x86/unwind: Add the ORC unwinder") Reported-by: Dmitry Monakhov Suggested-by: Peter Zijlstra Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220429051441.14251-1-ravi.bangoria@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/ibs.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 9739019d4b67..11e8b493e015 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -304,6 +304,16 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. Setting _EARLY flag + * makes sure we unwind call-stack before perf sample rip is set to + * IbsOpRip. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; + return 0; } @@ -687,6 +697,14 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) data.raw = &raw; } + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + data.callchain = perf_callchain(event, iregs); + throttle = perf_event_overflow(event, &data, ®s); out: if (throttle) { -- Gitee From e134ca6d02dac2640684ec186fe3e7eaa90d7844 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:07 +0530 Subject: [PATCH 21/24] perf/amd/ibs: Cascade pmu init functions' return value mainline inclusion from mainline-v5.15 commit 39b2ca75eec8a33e2ffdb8aa0c4840ec3e3b472c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- IBS pmu initialization code ignores return value provided by callee functions. Fix it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-2-ravi.bangoria@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/ibs.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 11e8b493e015..2704ec1e42a3 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -777,9 +777,10 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init void perf_event_ibs_init(void) +static __init int perf_event_ibs_init(void) { struct attribute **attr = ibs_op_format_attrs; + int ret; /* * Some chips fail to reset the fetch count when it is written; instead @@ -791,7 +792,9 @@ static __init void perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ret) + return ret; if (ibs_caps & IBS_CAPS_OPCNT) { perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -804,15 +807,35 @@ static __init void perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + if (ret) + goto err_op; + + ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + if (ret) + goto err_nmi; - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + return 0; + +err_nmi: + perf_pmu_unregister(&perf_ibs_op.pmu); + free_percpu(perf_ibs_op.pcpu); + perf_ibs_op.pcpu = NULL; +err_op: + perf_pmu_unregister(&perf_ibs_fetch.pmu); + free_percpu(perf_ibs_fetch.pcpu); + perf_ibs_fetch.pcpu = NULL; + + return ret; } #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ -static __init void perf_event_ibs_init(void) { } +static __init int perf_event_ibs_init(void) +{ + return 0; +} #endif @@ -1082,9 +1105,7 @@ static __init int amd_ibs_init(void) x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); - perf_event_ibs_init(); - - return 0; + return perf_event_ibs_init(); } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ -- Gitee From 98418b13e9d646ad293ba41604e4a802c91073b9 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:08 +0530 Subject: [PATCH 22/24] perf/amd/ibs: Use ->is_visible callback for dynamic attributes mainline inclusion from mainline-v5.15 commit 2a7a7e658682bfd7501dc6b4c9d365aa6c79788a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- Currently, some attributes are added at build time whereas others at boot time depending on IBS pmu capabilities. Instead, we can just add all attribute groups at build time but hide individual group at boot time using more appropriate ->is_visible() callback. Also, struct perf_ibs has bunch of fields for pmu attributes which just pass on the pointer, does not do anything else. Remove them. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-3-ravi.bangoria@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/ibs.c | 78 +++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2704ec1e42a3..ece4f6a7d24b 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -94,10 +94,6 @@ struct perf_ibs { unsigned int fetch_ignore_if_zero_rip : 1; struct cpu_perf_ibs __percpu *pcpu; - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - u64 (*get_count)(u64 config); }; @@ -528,16 +524,61 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +/* + * We need to initialize with empty group if all attributes in the + * group are dynamic. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group empty_format_group = { + .name = "format", + .attrs = attrs_empty, +}; + +static const struct attribute_group *empty_attr_groups[] = { + &empty_format_group, + NULL, +}; + PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); -static struct attribute *ibs_fetch_format_attrs[] = { +static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ +static struct attribute_group group_rand_en = { + .name = "format", + .attrs = rand_en_attrs, +}; + +static const struct attribute_group *fetch_attr_groups[] = { + &group_rand_en, + NULL, +}; + +static umode_t +cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; +} + +static struct attribute *cnt_ctl_attrs[] = { + &format_attr_cnt_ctl.attr, + NULL, +}; + +static struct attribute_group group_cnt_ctl = { + .name = "format", + .attrs = cnt_ctl_attrs, + .is_visible = cnt_ctl_is_visible, +}; + +static const struct attribute_group *op_attr_update[] = { + &group_cnt_ctl, NULL, }; @@ -561,7 +602,6 @@ static struct perf_ibs perf_ibs_fetch = { .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, .get_count = get_ibs_fetch_count, }; @@ -587,7 +627,6 @@ static struct perf_ibs perf_ibs_op = { .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, .get_count = get_ibs_op_count, }; @@ -757,17 +796,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) perf_ibs->pcpu = pcpu; - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); if (ret) { perf_ibs->pcpu = NULL; @@ -779,7 +807,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) static __init int perf_event_ibs_init(void) { - struct attribute **attr = ibs_op_format_attrs; int ret; /* @@ -792,14 +819,14 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ret) return ret; - if (ibs_caps & IBS_CAPS_OPCNT) { + if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; - } if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; @@ -807,6 +834,9 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_update = op_attr_update; + ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); if (ret) goto err_op; -- Gitee From 2d2cdfed2773e76c476e1c1b8f6914488bf2f6f9 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:09 +0530 Subject: [PATCH 23/24] perf/amd/ibs: Add support for L3 miss filtering mainline inclusion from mainline-v5.15 commit ba5d35b442c65f32d38ef61f732218274c6dcf4c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- IBS L3 miss filtering works by tagging an instruction on IBS counter overflow and generating an NMI if the tagged instruction causes an L3 miss. Samples without an L3 miss are discarded and counter is reset with random value (between 1-15 for fetch pmu and 1-127 for op pmu). This helps in reducing sampling overhead when user is interested only in such samples. One of the use case of such filtered samples is to feed data to page-migration daemon in tiered memory systems. Add support for L3 miss filtering in IBS driver via new pmu attribute "l3missonly". Example usage: # perf record -a -e ibs_op/l3missonly=1/ --raw-samples sleep 5 Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-4-ravi.bangoria@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/ibs.c | 67 +++++++++++++++++++++++++++---- arch/x86/include/asm/perf_event.h | 3 ++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ece4f6a7d24b..2dc8b7ec030a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -544,22 +544,46 @@ static const struct attribute_group *empty_attr_groups[] = { PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); +PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); + +static umode_t +zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; +} static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; +static struct attribute *fetch_l3missonly_attrs[] = { + &fetch_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, }; +static struct attribute_group group_fetch_l3missonly = { + .name = "format", + .attrs = fetch_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, NULL, }; +static const struct attribute_group *fetch_attr_update[] = { + &group_fetch_l3missonly, + NULL, +}; + static umode_t cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) { @@ -571,14 +595,26 @@ static struct attribute *cnt_ctl_attrs[] = { NULL, }; +static struct attribute *op_l3missonly_attrs[] = { + &op_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, .is_visible = cnt_ctl_is_visible, }; +static struct attribute_group group_op_l3missonly = { + .name = "format", + .attrs = op_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, + &group_op_l3missonly, NULL, }; @@ -805,10 +841,8 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init int perf_event_ibs_init(void) +static __init int perf_ibs_fetch_init(void) { - int ret; - /* * Some chips fail to reset the fetch count when it is written; instead * they need a 0-1 transition of IbsFetchEn. @@ -819,12 +853,17 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + perf_ibs_fetch.pmu.attr_update = fetch_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); - if (ret) - return ret; + return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +} +static __init int perf_ibs_op_init(void) +{ if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -834,10 +873,24 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; + perf_ibs_op.pmu.attr_groups = empty_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +} + +static __init int perf_event_ibs_init(void) +{ + int ret; + + ret = perf_ibs_fetch_init(); + if (ret) + return ret; + + ret = perf_ibs_op_init(); if (ret) goto err_op; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 513b66da416b..feb09f963f1e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -405,6 +405,7 @@ struct pebs_xmm { #define IBS_CAPS_OPBRNFUSE (1U<<8) #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) +#define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -418,6 +419,7 @@ struct pebs_xmm { #define IBSCTL_LVT_OFFSET_MASK 0x0F /* IBS fetch bits/masks */ +#define IBS_FETCH_L3MISSONLY (1ULL<<59) #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) @@ -434,6 +436,7 @@ struct pebs_xmm { #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_L3MISSONLY (1ULL<<16) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ -- Gitee From 55e3459d3b52607a9f7f25c406f522fcd93958f5 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:10 +0530 Subject: [PATCH 24/24] perf/amd/ibs: Advertise zen4_ibs_extensions as pmu capability attribute mainline inclusion from mainline-v5.15 commit 838de1d843fc9b6161e0e1c6308a8c027d08606d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5S3WV CVE: NA ------------------------------------------------- PMU driver can advertise certain feature via capability attribute('caps' sysfs directory) which can be consumed by userspace tools like perf. Add zen4_ibs_extensions capability attribute for IBS pmus. This attribute will be enabled when CPUID_Fn8000001B_EAX[11] is set. With patch on Zen4: $ ls /sys/bus/event_source/devices/ibs_op/caps zen4_ibs_extensions Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-5-ravi.bangoria@amd.com Signed-off-by: Xie Haocheng --- arch/x86/events/amd/ibs.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2dc8b7ec030a..c251bc44c088 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -537,8 +537,14 @@ static struct attribute_group empty_format_group = { .attrs = attrs_empty, }; +static struct attribute_group empty_caps_group = { + .name = "caps", + .attrs = attrs_empty, +}; + static const struct attribute_group *empty_attr_groups[] = { &empty_format_group, + &empty_caps_group, NULL, }; @@ -546,6 +552,7 @@ PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -563,6 +570,11 @@ static struct attribute *fetch_l3missonly_attrs[] = { NULL, }; +static struct attribute *zen4_ibs_extensions_attrs[] = { + &zen4_ibs_extensions.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, @@ -574,13 +586,21 @@ static struct attribute_group group_fetch_l3missonly = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_zen4_ibs_extensions = { + .name = "caps", + .attrs = zen4_ibs_extensions_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, + &empty_caps_group, NULL, }; static const struct attribute_group *fetch_attr_update[] = { &group_fetch_l3missonly, + &group_zen4_ibs_extensions, NULL, }; @@ -615,6 +635,7 @@ static struct attribute_group group_op_l3missonly = { static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, + &group_zen4_ibs_extensions, NULL, }; -- Gitee