diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7814ff2e45c718440a1db7af51d30c5c2f1509e0..a77a41abe13704a67f2d8f9ad397be568b47f1c8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1222,30 +1222,24 @@ DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ -static __always_inline void -perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) +static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { - if (static_key_false(&perf_swevent_enabled[event_id])) { - struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); - perf_fetch_caller_regs(regs); - ___perf_sw_event(event_id, nr, regs, addr); - } + perf_fetch_caller_regs(regs); + ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; -static __always_inline bool -perf_sw_migrate_enabled(void) +static __always_inline bool __perf_sw_enabled(int swevt) { - if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS])) - return true; - return false; + return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { - if (perf_sw_migrate_enabled()) + if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } @@ -1255,11 +1249,9 @@ static inline void perf_event_task_sched_in(struct task_struct *prev, if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); - if (perf_sw_migrate_enabled() && task->sched_migrated) { - struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); - - perf_fetch_caller_regs(regs); - ___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0); + if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && + task->sched_migrated) { + __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } @@ -1267,7 +1259,15 @@ static inline void perf_event_task_sched_in(struct task_struct *prev, static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { - perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); + if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) + __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); + +#ifdef CONFIG_CGROUP_PERF + if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && + perf_cgroup_from_task(prev, NULL) != + perf_cgroup_from_task(next, NULL)) + __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); +#endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); @@ -1535,8 +1535,6 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh) static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void -perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { } -static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index dceb9dc9f1130e6ace2742d720f38350b8e4eec6..414ebae7009207224cb3dd1c81a7935426a74130 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -127,6 +127,7 @@ enum perf_sw_ids { PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_BPF_OUTPUT = 10, + PERF_COUNT_SW_CGROUP_SWITCHES = 11, PERF_COUNT_SW_MAX, /* non-ABI */ }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1fbda5c87a293d30435ce652eadc0eb9f61b81f8..205c880d9e0148a23f3a59ac4ae077caff25ec26 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -343,9 +343,15 @@ const struct bpf_func_proto bpf_jiffies64_proto = { #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { - struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *cgrp; + u64 cgrp_id; - return cgroup_id(cgrp); + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + cgrp_id = cgroup_id(cgrp); + rcu_read_unlock(); + + return cgrp_id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { @@ -356,13 +362,17 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) { - struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *cgrp; struct cgroup *ancestor; + u64 cgrp_id; + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); ancestor = cgroup_ancestor(cgrp, ancestor_level); - if (!ancestor) - return 0; - return cgroup_id(ancestor); + cgrp_id = ancestor ? cgroup_id(ancestor) : 0; + rcu_read_unlock(); + + return cgrp_id; } const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ad28507e16dcdb07919db091cb4920eb72cf86ca..ebbb4650bff0be3d10aa75ac99f9a634358583f8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1318,6 +1318,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 802c4a11340055dc8157cbb250cfa20d397f3b32..0e031ab5fbc1c32401add99986b6f8bc0550d129 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -145,6 +145,8 @@ VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ /boot/vmlinux-$(shell uname -r) VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +bootstrap: $(BPFTOOL_BOOTSTRAP) + ifneq ($(VMLINUX_BTF)$(VMLINUX_H),) ifeq ($(feature-clang-bpf-co-re),1) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 2b948928dfd6ee2e5714985a5b6f946819a95348..2e8dae24bc0754bb6039d62bf123c6eecd60c741 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -100,7 +100,9 @@ FEATURE_TESTS_EXTRA := \ clang \ libbpf \ libpfm4 \ - libdebuginfod + libdebuginfod \ + clang-bpf-co-re + FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 8611aa6e248305056e00b48a4191cbd7c4866f44..03923346cc0534fff4a6aac4077d501591a290d2 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -112,6 +112,7 @@ enum perf_sw_ids { PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_BPF_OUTPUT = 10, + PERF_COUNT_SW_CGROUP_SWITCHES = 11, PERF_COUNT_SW_MAX, /* non-ABI */ }; diff --git a/tools/lib/perf/include/perf/bpf_perf.h b/tools/lib/perf/include/perf/bpf_perf.h new file mode 100644 index 0000000000000000000000000000000000000000..64c8d211726d7f6831dffedcdd8025536a201322 --- /dev/null +++ b/tools/lib/perf/include/perf/bpf_perf.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __LIBPERF_BPF_PERF_H +#define __LIBPERF_BPF_PERF_H + +#include /* for __u32 */ + +/* + * bpf_perf uses a hashmap, the attr_map, to track all the leader programs. + * The hashmap is pinned in bpffs. flock() on this file is used to ensure + * no concurrent access to the attr_map. The key of attr_map is struct + * perf_event_attr, and the value is struct perf_event_attr_map_entry. + * + * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the + * leader prog, and the diff_map. Each perf-stat session holds a reference + * to the bpf_link to make sure the leader prog is attached to sched_switch + * tracepoint. + * + * Since the hashmap only contains IDs of the bpf_link and diff_map, it + * does not hold any references to the leader program. Once all perf-stat + * sessions of these events exit, the leader prog, its maps, and the + * perf_events will be freed. + */ +struct perf_event_attr_map_entry { + __u32 link_id; + __u32 diff_map_id; + __u8 supported; +}; + +/* default attr_map name */ +#define BPF_PERF_DEFAULT_ATTR_MAP_PATH "perf_attr_map" + +#endif /* __LIBPERF_BPF_PERF_H */ diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 4c7db1da8fccc29364a54b43542bed50d05bda0a..a4881d32b318a9a15f17cb04044ded6f28ca0b6d 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -59,6 +59,7 @@ counted. The following modifiers exist: D - pin the event to the PMU W - group is weak and will fallback to non-group if not schedulable, e - group or event are exclusive and do not share the PMU + b - use BPF aggregration (see perf stat --bpf-counters) The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier can be specified multiple times: diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index f9bcd95bf35237df4360c82953da07888f3b0085..c2ce57aab63e625e606240cd35701efddfdc70ef 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -75,6 +75,37 @@ report:: --tid=:: stat events on existing thread id (comma separated list) +-b:: +--bpf-prog:: + stat events on existing bpf program id (comma separated list), + requiring root rights. bpftool-prog could be used to find program + id all bpf programs in the system. For example: + + # bpftool prog | head -n 1 + 17247: tracepoint name sys_enter tag 192d548b9d754067 gpl + + # perf stat -e cycles,instructions --bpf-prog 17247 --timeout 1000 + + Performance counter stats for 'BPF program(s) 17247': + + 85,967 cycles + 28,982 instructions # 0.34 insn per cycle + + 1.102235068 seconds time elapsed + +--bpf-counters:: + Use BPF programs to aggregate readings from perf_events. This + allows multiple perf-stat sessions that are counting the same metric (cycles, + instructions, etc.) to share hardware counters. + To use BPF programs on common events by default, use + "perf config stat.bpf-counter-events=". + +--bpf-attr-map:: + With option "--bpf-counters", different perf-stat sessions share + information about shared BPF programs and maps via a pinned hashmap. + Use "--bpf-attr-map" to specify the path of this pinned hashmap. + The default path is /sys/fs/bpf/perf_attr_map. + ifdef::HAVE_LIBPFM[] --pfm-events events:: Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index a93b4b55fd2fcac0f640a3aa57f3a2b102259c33..e4e3fe440f8c0c8470beeff8abd7b2b20f788f48 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -659,6 +659,15 @@ ifndef NO_LIBBPF endif endif +ifdef BUILD_BPF_SKEL + $(call feature_check,clang-bpf-co-re) + ifeq ($(feature-clang-bpf-co-re), 0) + dummy := $(error Error: clang too old. Please install recent clang) + endif + $(call detected,CONFIG_PERF_BPF_SKEL) + CFLAGS += -DHAVE_BPF_SKEL +endif + dwarf-post-unwind := 1 dwarf-post-unwind-text := BUG diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index e41a8f9b99d2d2d8cf53b8e82e269e9734bc2c78..b55674205c0769ab392b41612dd1446b3da26081 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -126,6 +126,8 @@ include ../scripts/utilities.mak # # Define NO_LIBDEBUGINFOD if you do not want support debuginfod # +# Define BUILD_BPF_SKEL to enable BPF skeletons +# # As per kernel Makefile, avoid funny character set dependencies unexport LC_ALL @@ -175,6 +177,12 @@ endef LD += $(EXTRA_LDFLAGS) +HOSTCC ?= gcc +HOSTLD ?= ld +HOSTAR ?= ar +CLANG ?= clang +LLVM_STRIP ?= llvm-strip + PKG_CONFIG = $(CROSS_COMPILE)pkg-config LLVM_CONFIG ?= llvm-config @@ -731,7 +739,8 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioc $(x86_arch_prctl_code_array) \ $(rename_flags_array) \ $(arch_errno_name_array) \ - $(sync_file_range_arrays) + $(sync_file_range_arrays) \ + bpf-skel $(OUTPUT)%.o: %.c prepare FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ @@ -1004,7 +1013,59 @@ config-clean: python-clean: $(python-clean) -clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean config-clean fixdep-clean python-clean +SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel) +SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp) +SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h +SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h +SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h + +ifdef BUILD_BPF_SKEL +BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool +LIBBPF_SRC := $(abspath ../lib/bpf) +BPF_INCLUDE := -I$(SKEL_TMP_OUT)/.. -I$(BPF_PATH) -I$(LIBBPF_SRC)/.. + +$(SKEL_TMP_OUT): + $(Q)$(MKDIR) -p $@ + +$(BPFTOOL): | $(SKEL_TMP_OUT) + CFLAGS= $(MAKE) -C ../bpf/bpftool \ + OUTPUT=$(SKEL_TMP_OUT)/ bootstrap + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +$(SKEL_OUT)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) +ifeq ($(VMLINUX_H),) + $(QUIET_GEN)$(BPFTOOL) btf dump file $< format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) $(SKEL_OUT)/vmlinux.h | $(SKEL_TMP_OUT) + $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf $(BPF_INCLUDE) \ + -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@ + +$(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL) + $(QUIET_GENSKEL)$(BPFTOOL) gen skeleton $< > $@ + +bpf-skel: $(SKELETONS) + +.PRECIOUS: $(SKEL_TMP_OUT)/%.bpf.o + +else # BUILD_BPF_SKEL + +bpf-skel: + +endif # BUILD_BPF_SKEL + +bpf-skel-clean: + $(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS) + +clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean config-clean fixdep-clean python-clean bpf-skel-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a5dafba23d93569d2d3f549de09e047cbeca2736..babe763c38372534233477480bb9f61c6e0d186c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -67,6 +67,7 @@ #include "util/top.h" #include "util/affinity.h" #include "util/pfm.h" +#include "util/bpf_counter.h" #include "asm/bug.h" #include @@ -146,6 +147,7 @@ static const char *smi_cost_attrs = { }; static struct evlist *evsel_list; +static bool all_counters_use_bpf = true; static struct target target = { .uid = UINT_MAX, @@ -385,6 +387,9 @@ static int read_affinity_counters(struct timespec *rs) struct affinity affinity; int i, ncpus, cpu; + if (all_counters_use_bpf) + return 0; + if (affinity__setup(&affinity) < 0) return -1; @@ -399,6 +404,8 @@ static int read_affinity_counters(struct timespec *rs) evlist__for_each_entry(evsel_list, counter) { if (evsel__cpu_iter_skip(counter, cpu)) continue; + if (evsel__is_bpf(counter)) + continue; if (!counter->err) { counter->err = read_counter_cpu(counter, rs, counter->cpu_iter - 1); @@ -409,12 +416,31 @@ static int read_affinity_counters(struct timespec *rs) return 0; } +static int read_bpf_map_counters(void) +{ + struct evsel *counter; + int err; + + evlist__for_each_entry(evsel_list, counter) { + if (!evsel__is_bpf(counter)) + continue; + + err = bpf_counter__read(counter); + if (err) + return err; + } + return 0; +} + static void read_counters(struct timespec *rs) { struct evsel *counter; - if (!stat_config.stop_read_counter && (read_affinity_counters(rs) < 0)) - return; + if (!stat_config.stop_read_counter) { + if (read_bpf_map_counters() || + read_affinity_counters(rs)) + return; + } evlist__for_each_entry(evsel_list, counter) { if (counter->err) @@ -496,11 +522,23 @@ static bool handle_interval(unsigned int interval, int *times) return false; } -static void enable_counters(void) +static int enable_counters(void) { + struct evsel *evsel; + int err; + + evlist__for_each_entry(evsel_list, evsel) { + if (!evsel__is_bpf(evsel)) + continue; + + err = bpf_counter__enable(evsel); + if (err) + return err; + } + if (stat_config.initial_delay < 0) { pr_info(EVLIST_DISABLED_MSG); - return; + return 0; } if (stat_config.initial_delay > 0) { @@ -518,6 +556,7 @@ static void enable_counters(void) if (stat_config.initial_delay > 0) pr_info(EVLIST_ENABLED_MSG); } + return 0; } static void disable_counters(void) @@ -720,7 +759,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) const bool forks = (argc > 0); bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false; struct affinity affinity; - int i, cpu; + int i, cpu, err; bool second_pass = false; if (forks) { @@ -738,7 +777,20 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) if (affinity__setup(&affinity) < 0) return -1; + evlist__for_each_entry(evsel_list, counter) { + if (bpf_counter__load(counter, &target)) + return -1; + if (!(evsel__is_bperf(counter))) + all_counters_use_bpf = false; + } + evlist__for_each_cpu (evsel_list, i, cpu) { + /* + * bperf calls evsel__open_per_cpu() in bperf__load(), so + * no need to call it again here. + */ + if (target.use_bpf) + break; affinity__set(&affinity, cpu); evlist__for_each_entry(evsel_list, counter) { @@ -746,6 +798,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) continue; if (counter->reset_group || counter->errored) continue; + if (evsel__is_bperf(counter)) + continue; try_again: if (create_perf_stat_counter(counter, &stat_config, &target, counter->cpu_iter - 1) < 0) { @@ -853,7 +907,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) } if (STAT_RECORD) { - int err, fd = perf_data__fd(&perf_stat.data); + int fd = perf_data__fd(&perf_stat.data); if (is_pipe) { err = perf_header__write_pipe(perf_data__fd(&perf_stat.data)); @@ -874,12 +928,14 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) /* * Enable counters and exec the command: */ - t0 = rdclock(); - clock_gettime(CLOCK_MONOTONIC, &ref_time); - if (forks) { perf_evlist__start_workload(evsel_list); - enable_counters(); + err = enable_counters(); + if (err) + return -1; + + t0 = rdclock(); + clock_gettime(CLOCK_MONOTONIC, &ref_time); if (interval || timeout || evlist__ctlfd_initialized(evsel_list)) status = dispatch_events(forks, timeout, interval, ×); @@ -898,7 +954,13 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) if (WIFSIGNALED(status)) psignal(WTERMSIG(status), argv[0]); } else { - enable_counters(); + err = enable_counters(); + if (err) + return -1; + + t0 = rdclock(); + clock_gettime(CLOCK_MONOTONIC, &ref_time); + status = dispatch_events(forks, timeout, interval, ×); } @@ -1089,6 +1151,14 @@ static struct option stat_options[] = { "stat events on existing process id"), OPT_STRING('t', "tid", &target.tid, "tid", "stat events on existing thread id"), +#ifdef HAVE_BPF_SKEL + OPT_STRING('b', "bpf-prog", &target.bpf_str, "bpf-prog-id", + "stat events on existing bpf program id"), + OPT_BOOLEAN(0, "bpf-counters", &target.use_bpf, + "use bpf program to count events"), + OPT_STRING(0, "bpf-attr-map", &target.attr_map, "attr-map-path", + "path to perf_event_attr map"), +#endif OPT_BOOLEAN('a', "all-cpus", &target.system_wide, "system-wide collection from all CPUs"), OPT_BOOLEAN('g', "group", &group, @@ -2060,11 +2130,12 @@ int cmd_stat(int argc, const char **argv) "perf stat [] []", NULL }; - int status = -EINVAL, run_idx; + int status = -EINVAL, run_idx, err; const char *mode; FILE *output = stderr; unsigned int interval, timeout; const char * const stat_subcommands[] = { "record", "report" }; + char errbuf[BUFSIZ]; setlocale(LC_ALL, ""); @@ -2175,6 +2246,13 @@ int cmd_stat(int argc, const char **argv) } else if (big_num_opt == 0) /* User passed --no-big-num */ stat_config.big_num = false; + target.inherit = !stat_config.no_inherit; + err = target__validate(&target); + if (err) { + target__strerror(&target, err, errbuf, BUFSIZ); + pr_warning("%s\n", errbuf); + } + setup_system_wide(argc); /* @@ -2245,7 +2323,6 @@ int cmd_stat(int argc, const char **argv) goto out; } - target__validate(&target); if ((stat_config.aggr_mode == AGGR_THREAD) && (target.system_wide)) target.per_thread = true; @@ -2377,9 +2454,10 @@ int cmd_stat(int argc, const char **argv) * tools remain -acme */ int fd = perf_data__fd(&perf_stat.data); - int err = perf_event__synthesize_kernel_mmap((void *)&perf_stat, - process_synthesized_event, - &perf_stat.session->machines.host); + + err = perf_event__synthesize_kernel_mmap((void *)&perf_stat, + process_synthesized_event, + &perf_stat.session->machines.host); if (err) { pr_warning("Couldn't synthesize the kernel mmap record, harmless, " "older tools may produce warnings about this file\n."); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 5dc7942a959d55da86865c0699817e2f2d61ac5f..dc96e48ae3da553894299272322bda90085a04ce 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -17,7 +17,9 @@ #include "util/record.h" #include #include +#ifdef HAVE_LIBBPF_SUPPORT #include +#endif #include "util/bpf_map.h" #include "util/rlimit.h" #include "builtin.h" diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh new file mode 100755 index 0000000000000000000000000000000000000000..984d28388a29921b988f58c25ed667fdc0a9bb4e --- /dev/null +++ b/tools/perf/tests/shell/stat_bpf_counters.sh @@ -0,0 +1,76 @@ +#!/bin/sh +# perf stat --bpf-counters test +# SPDX-License-Identifier: GPL-2.0 + +set -e + +workload="perf test -w sqrtloop" + +# check whether $2 is within +/- 10% of $1 +compare_number() +{ + first_num=$1 + second_num=$2 + + # upper bound is first_num * 120% + upper=$(expr $first_num + $first_num / 5 ) + # lower bound is first_num * 80% + lower=$(expr $first_num - $first_num / 5 ) + + if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then + echo "The difference between $first_num and $second_num are greater than 20%." + exit 1 + fi +} + +check_counts() +{ + base_instructions=$1 + bpf_instructions=$2 + + if [ "$base_instructions" = "&1 | awk '/instructions/ {print $1}') + bpf_instructions=$(perf stat --no-big-num --bpf-counters -e instructions -- $workload 2>&1 | awk '/instructions/ {print $1}') + check_counts $base_instructions $bpf_instructions + compare_number $base_instructions $bpf_instructions + echo "[Success]" +} + +test_bpf_modifier() +{ + printf "Testing bpf event modifier " + stat_output=$(perf stat --no-big-num -e instructions/name=base_instructions/,instructions/name=bpf_instructions/b -- $workload 2>&1) + base_instructions=$(echo "$stat_output"| awk '/base_instructions/ {print $1}') + bpf_instructions=$(echo "$stat_output"| awk '/bpf_instructions/ {print $1}') + check_counts $base_instructions $bpf_instructions + compare_number $base_instructions $bpf_instructions + echo "[Success]" +} + +# skip if --bpf-counters is not supported +if ! perf stat -e instructionsi -e cycles --bpf-counters true > /dev/null 2>&1; then + if [ "$1" = "-v" ]; then + echo "Skipping: --bpf-counters not supported" + perf --no-pager stat -e instructions -e cycles--bpf-counters true || true + fi + exit 2 +fi + + + +test_bpf_counters +test_bpf_modifier + +exit 0 diff --git a/tools/perf/util/Build b/tools/perf/util/Build index fe253e003d72ba2964c1f3841692a4e8db2c7e3c..8d63595e6f56c1f004a4f346260f92003c626acf 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -139,6 +139,8 @@ perf-y += clockid.o perf-$(CONFIG_LIBBPF) += bpf-loader.o perf-$(CONFIG_LIBBPF) += bpf_map.o +perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o +perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o perf-$(CONFIG_LIBELF) += symbol-elf.o perf-$(CONFIG_LIBELF) += probe-file.o diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c new file mode 100644 index 0000000000000000000000000000000000000000..0a4477d493aa63b9cd62c11cf8289a0924b8bfb2 --- /dev/null +++ b/tools/perf/util/bpf_counter.c @@ -0,0 +1,833 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_counter.h" +#include "counts.h" +#include "debug.h" +#include "evsel.h" +#include "evlist.h" +#include "target.h" +#include "cgroup.h" +#include "cpumap.h" +#include "thread_map.h" + +#include "bpf_skel/bpf_prog_profiler.skel.h" +#include "bpf_skel/bperf_u.h" +#include "bpf_skel/bperf_leader.skel.h" +#include "bpf_skel/bperf_follower.skel.h" + +#define ATTR_MAP_SIZE 100 + +static inline void *u64_to_ptr(__u64 ptr) +{ + return (void *)(unsigned long)ptr; +} + +static struct bpf_counter *bpf_counter_alloc(void) +{ + struct bpf_counter *counter; + + counter = zalloc(sizeof(*counter)); + if (counter) + INIT_LIST_HEAD(&counter->list); + return counter; +} + +static int bpf_program_profiler__destroy(struct evsel *evsel) +{ + struct bpf_counter *counter, *tmp; + + list_for_each_entry_safe(counter, tmp, + &evsel->bpf_counter_list, list) { + list_del_init(&counter->list); + bpf_prog_profiler_bpf__destroy(counter->skel); + free(counter); + } + assert(list_empty(&evsel->bpf_counter_list)); + + return 0; +} + +static char *bpf_target_prog_name(int tgt_fd) +{ + struct bpf_prog_info_linear *info_linear; + struct bpf_func_info *func_info; + const struct btf_type *t; + char *name = NULL; + struct btf *btf; + + info_linear = bpf_program__get_prog_info_linear( + tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO); + if (IS_ERR_OR_NULL(info_linear)) { + pr_debug("failed to get info_linear for prog FD %d\n", tgt_fd); + return NULL; + } + + if (info_linear->info.btf_id == 0 || + btf__get_from_id(info_linear->info.btf_id, &btf)) { + pr_debug("prog FD %d doesn't have valid btf\n", tgt_fd); + goto out; + } + + func_info = u64_to_ptr(info_linear->info.func_info); + t = btf__type_by_id(btf, func_info[0].type_id); + if (!t) { + pr_debug("btf %d doesn't have type %d\n", + info_linear->info.btf_id, func_info[0].type_id); + goto out; + } + name = strdup(btf__name_by_offset(btf, t->name_off)); +out: + free(info_linear); + return name; +} + +static int bpf_program_profiler_load_one(struct evsel *evsel, u32 prog_id) +{ + struct bpf_prog_profiler_bpf *skel; + struct bpf_counter *counter; + struct bpf_program *prog; + char *prog_name; + int prog_fd; + int err; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) { + pr_err("Failed to open fd for bpf prog %u\n", prog_id); + return -1; + } + counter = bpf_counter_alloc(); + if (!counter) { + close(prog_fd); + return -1; + } + + skel = bpf_prog_profiler_bpf__open(); + if (!skel) { + pr_err("Failed to open bpf skeleton\n"); + goto err_out; + } + + skel->rodata->num_cpu = evsel__nr_cpus(evsel); + + bpf_map__resize(skel->maps.events, evsel__nr_cpus(evsel)); + bpf_map__resize(skel->maps.fentry_readings, 1); + bpf_map__resize(skel->maps.accum_readings, 1); + + prog_name = bpf_target_prog_name(prog_fd); + if (!prog_name) { + pr_err("Failed to get program name for bpf prog %u. Does it have BTF?\n", prog_id); + goto err_out; + } + + bpf_object__for_each_program(prog, skel->obj) { + err = bpf_program__set_attach_target(prog, prog_fd, prog_name); + if (err) { + pr_err("bpf_program__set_attach_target failed.\n" + "Does bpf prog %u have BTF?\n", prog_id); + goto err_out; + } + } + set_max_rlimit(); + err = bpf_prog_profiler_bpf__load(skel); + if (err) { + pr_err("bpf_prog_profiler_bpf__load failed\n"); + goto err_out; + } + + assert(skel != NULL); + counter->skel = skel; + list_add(&counter->list, &evsel->bpf_counter_list); + close(prog_fd); + return 0; +err_out: + bpf_prog_profiler_bpf__destroy(skel); + free(counter); + close(prog_fd); + return -1; +} + +static int bpf_program_profiler__load(struct evsel *evsel, struct target *target) +{ + char *bpf_str, *bpf_str_, *tok, *saveptr = NULL, *p; + u32 prog_id; + int ret; + + bpf_str_ = bpf_str = strdup(target->bpf_str); + if (!bpf_str) + return -1; + + while ((tok = strtok_r(bpf_str, ",", &saveptr)) != NULL) { + prog_id = strtoul(tok, &p, 10); + if (prog_id == 0 || prog_id == UINT_MAX || + (*p != '\0' && *p != ',')) { + pr_err("Failed to parse bpf prog ids %s\n", + target->bpf_str); + return -1; + } + + ret = bpf_program_profiler_load_one(evsel, prog_id); + if (ret) { + bpf_program_profiler__destroy(evsel); + free(bpf_str_); + return -1; + } + bpf_str = NULL; + } + free(bpf_str_); + return 0; +} + +static int bpf_program_profiler__enable(struct evsel *evsel) +{ + struct bpf_counter *counter; + int ret; + + list_for_each_entry(counter, &evsel->bpf_counter_list, list) { + assert(counter->skel != NULL); + ret = bpf_prog_profiler_bpf__attach(counter->skel); + if (ret) { + bpf_program_profiler__destroy(evsel); + return ret; + } + } + return 0; +} + +static int bpf_program_profiler__disable(struct evsel *evsel) +{ + struct bpf_counter *counter; + + list_for_each_entry(counter, &evsel->bpf_counter_list, list) { + assert(counter->skel != NULL); + bpf_prog_profiler_bpf__detach(counter->skel); + } + return 0; +} + +static int bpf_program_profiler__read(struct evsel *evsel) +{ + // perf_cpu_map uses /sys/devices/system/cpu/online + int num_cpu = evsel__nr_cpus(evsel); + // BPF_MAP_TYPE_PERCPU_ARRAY uses /sys/devices/system/cpu/possible + // Sometimes possible > online, like on a Ryzen 3900X that has 24 + // threads but its possible showed 0-31 -acme + int num_cpu_bpf = libbpf_num_possible_cpus(); + struct bpf_perf_event_value values[num_cpu_bpf]; + struct bpf_counter *counter; + int reading_map_fd; + __u32 key = 0; + int err, cpu; + + if (list_empty(&evsel->bpf_counter_list)) + return -EAGAIN; + + for (cpu = 0; cpu < num_cpu; cpu++) { + perf_counts(evsel->counts, cpu, 0)->val = 0; + perf_counts(evsel->counts, cpu, 0)->ena = 0; + perf_counts(evsel->counts, cpu, 0)->run = 0; + } + list_for_each_entry(counter, &evsel->bpf_counter_list, list) { + struct bpf_prog_profiler_bpf *skel = counter->skel; + + assert(skel != NULL); + reading_map_fd = bpf_map__fd(skel->maps.accum_readings); + + err = bpf_map_lookup_elem(reading_map_fd, &key, values); + if (err) { + pr_err("failed to read value\n"); + return err; + } + + for (cpu = 0; cpu < num_cpu; cpu++) { + perf_counts(evsel->counts, cpu, 0)->val += values[cpu].counter; + perf_counts(evsel->counts, cpu, 0)->ena += values[cpu].enabled; + perf_counts(evsel->counts, cpu, 0)->run += values[cpu].running; + } + } + return 0; +} + +static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu, + int fd) +{ + struct bpf_prog_profiler_bpf *skel; + struct bpf_counter *counter; + int ret; + + list_for_each_entry(counter, &evsel->bpf_counter_list, list) { + skel = counter->skel; + assert(skel != NULL); + + ret = bpf_map_update_elem(bpf_map__fd(skel->maps.events), + &cpu, &fd, BPF_ANY); + if (ret) + return ret; + } + return 0; +} + +struct bpf_counter_ops bpf_program_profiler_ops = { + .load = bpf_program_profiler__load, + .enable = bpf_program_profiler__enable, + .disable = bpf_program_profiler__disable, + .read = bpf_program_profiler__read, + .destroy = bpf_program_profiler__destroy, + .install_pe = bpf_program_profiler__install_pe, +}; + +static bool bperf_attr_map_compatible(int attr_map_fd) +{ + struct bpf_map_info map_info = {0}; + __u32 map_info_len = sizeof(map_info); + int err; + + err = bpf_obj_get_info_by_fd(attr_map_fd, &map_info, &map_info_len); + + if (err) + return false; + return (map_info.key_size == sizeof(struct perf_event_attr)) && + (map_info.value_size == sizeof(struct perf_event_attr_map_entry)); +} + +static int bperf_lock_attr_map(struct target *target) +{ + char path[PATH_MAX]; + int map_fd, err; + + if (target->attr_map) { + scnprintf(path, PATH_MAX, "%s", target->attr_map); + } else { + scnprintf(path, PATH_MAX, "%s/fs/bpf/%s", sysfs__mountpoint(), + BPF_PERF_DEFAULT_ATTR_MAP_PATH); + } + + if (access(path, F_OK)) { + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, + sizeof(struct perf_event_attr), + sizeof(struct perf_event_attr_map_entry), + ATTR_MAP_SIZE, 0); + if (map_fd < 0) + return -1; + + err = bpf_obj_pin(map_fd, path); + if (err) { + /* someone pinned the map in parallel? */ + close(map_fd); + map_fd = bpf_obj_get(path); + if (map_fd < 0) + return -1; + } + } else { + map_fd = bpf_obj_get(path); + if (map_fd < 0) + return -1; + } + + if (!bperf_attr_map_compatible(map_fd)) { + close(map_fd); + return -1; + + } + err = flock(map_fd, LOCK_EX); + if (err) { + close(map_fd); + return -1; + } + return map_fd; +} + +static int bperf_check_target(struct evsel *evsel, + struct target *target, + enum bperf_filter_type *filter_type, + __u32 *filter_entry_cnt) +{ + if (evsel->leader->core.nr_members > 1) { + pr_err("bpf managed perf events do not yet support groups.\n"); + return -1; + } + + /* determine filter type based on target */ + if (target->system_wide) { + *filter_type = BPERF_FILTER_GLOBAL; + *filter_entry_cnt = 1; + } else if (target->cpu_list) { + *filter_type = BPERF_FILTER_CPU; + *filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel)); + } else if (target->tid) { + *filter_type = BPERF_FILTER_PID; + *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads); + } else if (target->pid || evsel->evlist->workload.pid != -1) { + *filter_type = BPERF_FILTER_TGID; + *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads); + } else { + pr_err("bpf managed perf events do not yet support these targets.\n"); + return -1; + } + + return 0; +} + +static struct perf_cpu_map *all_cpu_map; +static __u32 filter_entry_cnt; + +static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd, + struct perf_event_attr_map_entry *entry) +{ + struct bperf_leader_bpf *skel = bperf_leader_bpf__open(); + int link_fd, diff_map_fd, err; + struct bpf_link *link = NULL; + + if (!skel) { + pr_err("Failed to open leader skeleton\n"); + return -1; + } + + bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus()); + err = bperf_leader_bpf__load(skel); + if (err) { + pr_err("Failed to load leader skeleton\n"); + goto out; + } + + link = bpf_program__attach(skel->progs.on_switch); + if (IS_ERR(link)) { + pr_err("Failed to attach leader program\n"); + err = PTR_ERR(link); + goto out; + } + + link_fd = bpf_link__fd(link); + diff_map_fd = bpf_map__fd(skel->maps.diff_readings); + entry->link_id = bpf_link_get_id(link_fd); + entry->diff_map_id = bpf_map_get_id(diff_map_fd); + + /* + * save leader_skel for install_pe, which is called within + * following evsel__open_per_cpu call + */ + evsel->leader_skel = skel; + if (!evsel__open_per_cpu(evsel, all_cpu_map, -1)) + entry->supported = true; + + err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY); + assert(err == 0); + + evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id); + assert(evsel->bperf_leader_link_fd >= 0); + +out: + bperf_leader_bpf__destroy(skel); + bpf_link__destroy(link); + return err; +} + +static int bperf_attach_follower_program(struct bperf_follower_bpf *skel, + enum bperf_filter_type filter_type, + bool inherit) +{ + struct bpf_link *link; + int err = 0; + + if ((filter_type == BPERF_FILTER_PID || + filter_type == BPERF_FILTER_TGID) && inherit) + /* attach all follower bpf progs to enable event inheritance */ + err = bperf_follower_bpf__attach(skel); + else { + link = bpf_program__attach(skel->progs.fexit_XXX); + if (IS_ERR(link)) + err = PTR_ERR(link); + } + + return err; +} + +static int bperf__load(struct evsel *evsel, struct target *target) +{ + struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff, false}; + int attr_map_fd, diff_map_fd = -1, err; + enum bperf_filter_type filter_type; + __u32 i; + + if (evsel->evlist->core.nr_entries > ATTR_MAP_SIZE) { + pr_err("Too many events, please limit to %d or less\n", + ATTR_MAP_SIZE); + return -1; + } + + if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt)) + return -1; + + if (!all_cpu_map) { + all_cpu_map = perf_cpu_map__new(NULL); + if (!all_cpu_map) + return -1; + } + + evsel->bperf_leader_prog_fd = -1; + evsel->bperf_leader_link_fd = -1; + + /* + * Step 1: hold a fd on the leader program and the bpf_link, if + * the program is not already gone, reload the program. + * Use flock() to ensure exclusive access to the perf_event_attr + * map. + */ + attr_map_fd = bperf_lock_attr_map(target); + if (attr_map_fd < 0) { + pr_err("Failed to lock perf_event_attr map\n"); + return -1; + } + + err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry); + if (err) { + err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY); + if (err) + goto out; + } + + evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id); + if (evsel->bperf_leader_link_fd < 0 && + bperf_reload_leader_program(evsel, attr_map_fd, &entry)) { + goto out; + } + evsel->supported = entry.supported; + + /* + * The bpf_link holds reference to the leader program, and the + * leader program holds reference to the maps. Therefore, if + * link_id is valid, diff_map_id should also be valid. + */ + evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id( + bpf_link_get_prog_id(evsel->bperf_leader_link_fd)); + assert(evsel->bperf_leader_prog_fd >= 0); + + diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id); + assert(diff_map_fd >= 0); + + /* + * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check + * whether the kernel support it + */ + err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0); + if (err) { + pr_err("The kernel does not support test_run for raw_tp BPF programs.\n" + "Therefore, --use-bpf might show inaccurate readings\n"); + goto out; + } + + /* Step 2: load the follower skeleton */ + evsel->follower_skel = bperf_follower_bpf__open(); + if (!evsel->follower_skel) { + err = -1; + pr_err("Failed to open follower skeleton\n"); + goto out; + } + + /* attach fexit program to the leader program */ + bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX, + evsel->bperf_leader_prog_fd, "on_switch"); + + /* connect to leader diff_reading map */ + bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd); + + /* set up reading map */ + bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings, + filter_entry_cnt); + err = bperf_follower_bpf__load(evsel->follower_skel); + if (err) { + pr_err("Failed to load follower skeleton\n"); + bperf_follower_bpf__destroy(evsel->follower_skel); + evsel->follower_skel = NULL; + goto out; + } + + for (i = 0; i < filter_entry_cnt; i++) { + int filter_map_fd; + __u32 key; + struct bperf_filter_value fval = { i, 0 }; + + if (filter_type == BPERF_FILTER_PID || + filter_type == BPERF_FILTER_TGID) + key = evsel->core.threads->map[i].pid; + else if (filter_type == BPERF_FILTER_CPU) + key = evsel->core.cpus->map[i]; + else + break; + + filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter); + bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY); + } + + evsel->follower_skel->bss->type = filter_type; + evsel->follower_skel->bss->inherit = target->inherit; + + err = bperf_attach_follower_program(evsel->follower_skel, filter_type, + target->inherit); + +out: + if (err && evsel->bperf_leader_link_fd >= 0) + close(evsel->bperf_leader_link_fd); + if (err && evsel->bperf_leader_prog_fd >= 0) + close(evsel->bperf_leader_prog_fd); + if (diff_map_fd >= 0) + close(diff_map_fd); + + flock(attr_map_fd, LOCK_UN); + close(attr_map_fd); + + return err; +} + +static int bperf__install_pe(struct evsel *evsel, int cpu, int fd) +{ + struct bperf_leader_bpf *skel = evsel->leader_skel; + + return bpf_map_update_elem(bpf_map__fd(skel->maps.events), + &cpu, &fd, BPF_ANY); +} + +/* + * trigger the leader prog on each cpu, so the accum_reading map could get + * the latest readings. + */ +static int bperf_sync_counters(struct evsel *evsel) +{ + int num_cpu, i, cpu; + + num_cpu = all_cpu_map->nr; + for (i = 0; i < num_cpu; i++) { + cpu = all_cpu_map->map[i]; + bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu); + } + return 0; +} + +static int bperf__enable(struct evsel *evsel) +{ + evsel->follower_skel->bss->enabled = 1; + return 0; +} + +static int bperf__disable(struct evsel *evsel) +{ + evsel->follower_skel->bss->enabled = 0; + return 0; +} + +static int bperf__read(struct evsel *evsel) +{ + struct bperf_follower_bpf *skel = evsel->follower_skel; + __u32 num_cpu_bpf = cpu__max_cpu(); + struct bpf_perf_event_value values[num_cpu_bpf]; + int reading_map_fd, err = 0; + __u32 i, j, num_cpu; + + bperf_sync_counters(evsel); + reading_map_fd = bpf_map__fd(skel->maps.accum_readings); + + for (i = 0; i < filter_entry_cnt; i++) { + __u32 cpu; + + err = bpf_map_lookup_elem(reading_map_fd, &i, values); + if (err) + goto out; + switch (evsel->follower_skel->bss->type) { + case BPERF_FILTER_GLOBAL: + assert(i == 0); + + num_cpu = all_cpu_map->nr; + for (j = 0; j < num_cpu; j++) { + cpu = all_cpu_map->map[j]; + perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter; + perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled; + perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running; + } + break; + case BPERF_FILTER_CPU: + cpu = evsel->core.cpus->map[i]; + perf_counts(evsel->counts, i, 0)->val = values[cpu].counter; + perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled; + perf_counts(evsel->counts, i, 0)->run = values[cpu].running; + break; + case BPERF_FILTER_PID: + case BPERF_FILTER_TGID: + perf_counts(evsel->counts, 0, i)->val = 0; + perf_counts(evsel->counts, 0, i)->ena = 0; + perf_counts(evsel->counts, 0, i)->run = 0; + + for (cpu = 0; cpu < num_cpu_bpf; cpu++) { + perf_counts(evsel->counts, 0, i)->val += values[cpu].counter; + perf_counts(evsel->counts, 0, i)->ena += values[cpu].enabled; + perf_counts(evsel->counts, 0, i)->run += values[cpu].running; + } + break; + default: + break; + } + } +out: + return err; +} + +static int bperf__destroy(struct evsel *evsel) +{ + bperf_follower_bpf__destroy(evsel->follower_skel); + close(evsel->bperf_leader_prog_fd); + close(evsel->bperf_leader_link_fd); + return 0; +} + +/* + * bperf: share hardware PMCs with BPF + * + * perf uses performance monitoring counters (PMC) to monitor system + * performance. The PMCs are limited hardware resources. For example, + * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu. + * + * Modern data center systems use these PMCs in many different ways: + * system level monitoring, (maybe nested) container level monitoring, per + * process monitoring, profiling (in sample mode), etc. In some cases, + * there are more active perf_events than available hardware PMCs. To allow + * all perf_events to have a chance to run, it is necessary to do expensive + * time multiplexing of events. + * + * On the other hand, many monitoring tools count the common metrics + * (cycles, instructions). It is a waste to have multiple tools create + * multiple perf_events of "cycles" and occupy multiple PMCs. + * + * bperf tries to reduce such wastes by allowing multiple perf_events of + * "cycles" or "instructions" (at different scopes) to share PMUs. Instead + * of having each perf-stat session to read its own perf_events, bperf uses + * BPF programs to read the perf_events and aggregate readings to BPF maps. + * Then, the perf-stat session(s) reads the values from these BPF maps. + * + * || + * shared progs and maps <- || -> per session progs and maps + * || + * --------------- || + * | perf_events | || + * --------------- fexit || ----------------- + * | --------||----> | follower prog | + * --------------- / || --- ----------------- + * cs -> | leader prog |/ ||/ | | + * --> --------------- /|| -------------- ------------------ + * / | | / || | filter map | | accum_readings | + * / ------------ ------------ || -------------- ------------------ + * | | prev map | | diff map | || | + * | ------------ ------------ || | + * \ || | + * = \ ==================================================== | ============ + * \ / user space + * \ / + * \ / + * BPF_PROG_TEST_RUN BPF_MAP_LOOKUP_ELEM + * \ / + * \ / + * \------ perf-stat ----------------------/ + * + * The figure above shows the architecture of bperf. Note that the figure + * is divided into 3 regions: shared progs and maps (top left), per session + * progs and maps (top right), and user space (bottom). + * + * The leader prog is triggered on each context switch (cs). The leader + * prog reads perf_events and stores the difference (current_reading - + * previous_reading) to the diff map. For the same metric, e.g. "cycles", + * multiple perf-stat sessions share the same leader prog. + * + * Each perf-stat session creates a follower prog as fexit program to the + * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38) + * follower progs to the same leader prog. The follower prog checks current + * task and processor ID to decide whether to add the value from the diff + * map to its accumulated reading map (accum_readings). + * + * Finally, perf-stat user space reads the value from accum_reading map. + * + * Besides context switch, it is also necessary to trigger the leader prog + * before perf-stat reads the value. Otherwise, the accum_reading map may + * not have the latest reading from the perf_events. This is achieved by + * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU. + * + * Comment before the definition of struct perf_event_attr_map_entry + * describes how different sessions of perf-stat share information about + * the leader prog. + */ + +struct bpf_counter_ops bperf_ops = { + .load = bperf__load, + .enable = bperf__enable, + .disable = bperf__disable, + .read = bperf__read, + .install_pe = bperf__install_pe, + .destroy = bperf__destroy, +}; + +extern struct bpf_counter_ops bperf_cgrp_ops; + +static inline bool bpf_counter_skip(struct evsel *evsel) +{ + return list_empty(&evsel->bpf_counter_list) && + evsel->follower_skel == NULL; +} + +int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd) +{ + if (bpf_counter_skip(evsel)) + return 0; + return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd); +} + +int bpf_counter__load(struct evsel *evsel, struct target *target) +{ + if (target->bpf_str) + evsel->bpf_counter_ops = &bpf_program_profiler_ops; + else if (cgrp_event_expanded && target->use_bpf) + evsel->bpf_counter_ops = &bperf_cgrp_ops; + else if (target->use_bpf || evsel->bpf_counter || + evsel__match_bpf_counter_events(evsel->name)) + evsel->bpf_counter_ops = &bperf_ops; + + if (evsel->bpf_counter_ops) + return evsel->bpf_counter_ops->load(evsel, target); + return 0; +} + +int bpf_counter__enable(struct evsel *evsel) +{ + if (bpf_counter_skip(evsel)) + return 0; + return evsel->bpf_counter_ops->enable(evsel); +} + +int bpf_counter__disable(struct evsel *evsel) +{ + if (bpf_counter_skip(evsel)) + return 0; + return evsel->bpf_counter_ops->disable(evsel); +} + +int bpf_counter__read(struct evsel *evsel) +{ + if (bpf_counter_skip(evsel)) + return -EAGAIN; + return evsel->bpf_counter_ops->read(evsel); +} + +void bpf_counter__destroy(struct evsel *evsel) +{ + if (bpf_counter_skip(evsel)) + return; + evsel->bpf_counter_ops->destroy(evsel); + evsel->bpf_counter_ops = NULL; +} diff --git a/tools/perf/util/bpf_counter.h b/tools/perf/util/bpf_counter.h new file mode 100644 index 0000000000000000000000000000000000000000..99c505ee241c628efa7704fd68b743e9b3a9acc8 --- /dev/null +++ b/tools/perf/util/bpf_counter.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_BPF_COUNTER_H +#define __PERF_BPF_COUNTER_H 1 + +#include +#include + +#ifdef HAVE_LIBBPF_SUPPORT +#include +#include +#include +#endif + +struct evsel; +struct target; +struct bpf_counter; + +typedef int (*bpf_counter_evsel_op)(struct evsel *evsel); +typedef int (*bpf_counter_evsel_target_op)(struct evsel *evsel, + struct target *target); +typedef int (*bpf_counter_evsel_install_pe_op)(struct evsel *evsel, + int cpu, + int fd); + +struct bpf_counter_ops { + bpf_counter_evsel_target_op load; + bpf_counter_evsel_op enable; + bpf_counter_evsel_op disable; + bpf_counter_evsel_op read; + bpf_counter_evsel_op destroy; + bpf_counter_evsel_install_pe_op install_pe; +}; + +struct bpf_counter { + void *skel; + struct list_head list; +}; + +#ifdef HAVE_BPF_SKEL + +int bpf_counter__load(struct evsel *evsel, struct target *target); +int bpf_counter__enable(struct evsel *evsel); +int bpf_counter__disable(struct evsel *evsel); +int bpf_counter__read(struct evsel *evsel); +void bpf_counter__destroy(struct evsel *evsel); +int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd); + +#else /* HAVE_BPF_SKEL */ + +#include + +static inline int bpf_counter__load(struct evsel *evsel __maybe_unused, + struct target *target __maybe_unused) +{ + return 0; +} + +static inline int bpf_counter__enable(struct evsel *evsel __maybe_unused) +{ + return 0; +} + +static inline int bpf_counter__disable(struct evsel *evsel __maybe_unused) +{ + return 0; +} + +static inline int bpf_counter__read(struct evsel *evsel __maybe_unused) +{ + return -EAGAIN; +} + +static inline void bpf_counter__destroy(struct evsel *evsel __maybe_unused) +{ +} + +static inline int bpf_counter__install_pe(struct evsel *evsel __maybe_unused, + int cpu __maybe_unused, + int fd __maybe_unused) +{ + return 0; +} + +#endif /* HAVE_BPF_SKEL */ + +static inline void set_max_rlimit(void) +{ + struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; + + setrlimit(RLIMIT_MEMLOCK, &rinf); +} + +#ifdef HAVE_BPF_SKEL + +static inline __u32 bpf_link_get_id(int fd) +{ + struct bpf_link_info link_info = { .id = 0, }; + __u32 link_info_len = sizeof(link_info); + + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); + return link_info.id; +} + +static inline __u32 bpf_link_get_prog_id(int fd) +{ + struct bpf_link_info link_info = { .id = 0, }; + __u32 link_info_len = sizeof(link_info); + + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); + return link_info.prog_id; +} + +static inline __u32 bpf_map_get_id(int fd) +{ + struct bpf_map_info map_info = { .id = 0, }; + __u32 map_info_len = sizeof(map_info); + + bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len); + return map_info.id; +} + +/* trigger the leader program on a cpu */ +static inline int bperf_trigger_reading(int prog_fd, int cpu) +{ + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .ctx_in = NULL, + .ctx_size_in = 0, + .flags = BPF_F_TEST_RUN_ON_CPU, + .cpu = cpu, + .retval = 0, + ); + + return bpf_prog_test_run_opts(prog_fd, &opts); +} +#endif /* HAVE_BPF_SKEL */ + +#endif /* __PERF_BPF_COUNTER_H */ diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c new file mode 100644 index 0000000000000000000000000000000000000000..11c0ebadf3f5e78c6d9460de66c46400928ea2c8 --- /dev/null +++ b/tools/perf/util/bpf_counter_cgroup.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Copyright (c) 2021 Facebook */ +/* Copyright (c) 2021 Google */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "affinity.h" +#include "bpf_counter.h" +#include "cgroup.h" +#include "counts.h" +#include "debug.h" +#include "evsel.h" +#include "evlist.h" +#include "target.h" +#include "cpumap.h" +#include "thread_map.h" + +#include "bpf_skel/bperf_cgroup.skel.h" + +static struct perf_event_attr cgrp_switch_attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CGROUP_SWITCHES, + .size = sizeof(cgrp_switch_attr), + .sample_period = 1, + .disabled = 1, +}; + +static struct evsel *cgrp_switch; +static struct bperf_cgroup_bpf *skel; + +#define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0)) + +static int bperf_load_program(struct evlist *evlist) +{ + struct bpf_link *link; + struct evsel *evsel; + struct cgroup *cgrp, *leader_cgrp; + __u32 i, cpu; + __u32 nr_cpus = evlist->core.all_cpus->nr; + int total_cpus = cpu__max_cpu(); + int map_size, map_fd; + int prog_fd, err; + + skel = bperf_cgroup_bpf__open(); + if (!skel) { + pr_err("Failed to open cgroup skeleton\n"); + return -1; + } + + skel->rodata->num_cpus = total_cpus; + skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups; + + BUG_ON(evlist->core.nr_entries % nr_cgroups != 0); + + /* we need one copy of events per cpu for reading */ + map_size = total_cpus * evlist->core.nr_entries / nr_cgroups; + bpf_map__resize(skel->maps.events, map_size); + bpf_map__resize(skel->maps.cgrp_idx, nr_cgroups); + /* previous result is saved in a per-cpu array */ + map_size = evlist->core.nr_entries / nr_cgroups; + bpf_map__resize(skel->maps.prev_readings, map_size); + /* cgroup result needs all events (per-cpu) */ + map_size = evlist->core.nr_entries; + bpf_map__resize(skel->maps.cgrp_readings, map_size); + + set_max_rlimit(); + + err = bperf_cgroup_bpf__load(skel); + if (err) { + pr_err("Failed to load cgroup skeleton\n"); + goto out; + } + + if (cgroup_is_v2("perf_event") > 0) + skel->bss->use_cgroup_v2 = 1; + + err = -1; + + cgrp_switch = evsel__new(&cgrp_switch_attr); + if (evsel__open_per_cpu(cgrp_switch, evlist->core.all_cpus, -1) < 0) { + pr_err("Failed to open cgroup switches event\n"); + goto out; + } + + for (i = 0; i < nr_cpus; i++) { + link = bpf_program__attach_perf_event(skel->progs.on_cgrp_switch, + FD(cgrp_switch, i)); + if (IS_ERR(link)) { + pr_err("Failed to attach cgroup program\n"); + err = PTR_ERR(link); + goto out; + } + } + + /* + * Update cgrp_idx map from cgroup-id to event index. + */ + cgrp = NULL; + i = 0; + + evlist__for_each_entry(evlist, evsel) { + if (cgrp == NULL || evsel->cgrp == leader_cgrp) { + leader_cgrp = evsel->cgrp; + evsel->cgrp = NULL; + + /* open single copy of the events w/o cgroup */ + err = evsel__open_per_cpu(evsel, evlist->core.all_cpus, -1); + if (err == 0) + evsel->supported = true; + + map_fd = bpf_map__fd(skel->maps.events); + for (cpu = 0; cpu < nr_cpus; cpu++) { + int fd = FD(evsel, cpu); + __u32 idx = evsel->idx * total_cpus + + evlist->core.all_cpus->map[cpu]; + + bpf_map_update_elem(map_fd, &idx, &fd, BPF_ANY); + } + + evsel->cgrp = leader_cgrp; + } + + if (evsel->cgrp == cgrp) + continue; + + cgrp = evsel->cgrp; + + if (read_cgroup_id(cgrp) < 0) { + pr_err("Failed to get cgroup id\n"); + err = -1; + goto out; + } + + map_fd = bpf_map__fd(skel->maps.cgrp_idx); + err = bpf_map_update_elem(map_fd, &cgrp->id, &i, BPF_ANY); + if (err < 0) { + pr_err("Failed to update cgroup index map\n"); + goto out; + } + + i++; + } + + /* + * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check + * whether the kernel support it + */ + prog_fd = bpf_program__fd(skel->progs.trigger_read); + err = bperf_trigger_reading(prog_fd, 0); + if (err) { + pr_warning("The kernel does not support test_run for raw_tp BPF programs.\n" + "Therefore, --for-each-cgroup might show inaccurate readings\n"); + err = 0; + } + +out: + return err; +} + +static int bperf_cgrp__load(struct evsel *evsel, + struct target *target __maybe_unused) +{ + static bool bperf_loaded; + + evsel->bperf_leader_prog_fd = -1; + evsel->bperf_leader_link_fd = -1; + + if (!bperf_loaded && bperf_load_program(evsel->evlist)) + return -1; + + bperf_loaded = true; + /* just to bypass bpf_counter_skip() */ + evsel->follower_skel = (struct bperf_follower_bpf *)skel; + + return 0; +} + +static int bperf_cgrp__install_pe(struct evsel *evsel __maybe_unused, + int cpu __maybe_unused, int fd __maybe_unused) +{ + /* nothing to do */ + return 0; +} + +/* + * trigger the leader prog on each cpu, so the cgrp_reading map could get + * the latest results. + */ +static int bperf_cgrp__sync_counters(struct evlist *evlist) +{ + int i, cpu; + int nr_cpus = evlist->core.all_cpus->nr; + int prog_fd = bpf_program__fd(skel->progs.trigger_read); + + for (i = 0; i < nr_cpus; i++) { + cpu = evlist->core.all_cpus->map[i]; + bperf_trigger_reading(prog_fd, cpu); + } + + return 0; +} + +static int bperf_cgrp__enable(struct evsel *evsel) +{ + if (evsel->idx) + return 0; + + bperf_cgrp__sync_counters(evsel->evlist); + + skel->bss->enabled = 1; + return 0; +} + +static int bperf_cgrp__disable(struct evsel *evsel) +{ + if (evsel->idx) + return 0; + + bperf_cgrp__sync_counters(evsel->evlist); + + skel->bss->enabled = 0; + return 0; +} + +static int bperf_cgrp__read(struct evsel *evsel) +{ + struct evlist *evlist = evsel->evlist; + int i, cpu, nr_cpus = evlist->core.all_cpus->nr; + int total_cpus = cpu__max_cpu(); + struct perf_counts_values *counts; + struct bpf_perf_event_value *values; + int reading_map_fd, err = 0; + __u32 idx; + + if (evsel->idx) + return 0; + + bperf_cgrp__sync_counters(evsel->evlist); + + values = calloc(total_cpus, sizeof(*values)); + if (values == NULL) + return -ENOMEM; + + reading_map_fd = bpf_map__fd(skel->maps.cgrp_readings); + + evlist__for_each_entry(evlist, evsel) { + idx = evsel->idx; + err = bpf_map_lookup_elem(reading_map_fd, &idx, values); + if (err) { + pr_err("bpf map lookup falied: idx=%u, event=%s, cgrp=%s\n", + idx, evsel__name(evsel), evsel->cgrp->name); + goto out; + } + + for (i = 0; i < nr_cpus; i++) { + cpu = evlist->core.all_cpus->map[i]; + + counts = perf_counts(evsel->counts, i, 0); + counts->val = values[cpu].counter; + counts->ena = values[cpu].enabled; + counts->run = values[cpu].running; + } + } + +out: + free(values); + return err; +} + +static int bperf_cgrp__destroy(struct evsel *evsel) +{ + if (evsel->idx) + return 0; + + bperf_cgroup_bpf__destroy(skel); + evsel__delete(cgrp_switch); // it'll destroy on_switch progs too + + return 0; +} + +struct bpf_counter_ops bperf_cgrp_ops = { + .load = bperf_cgrp__load, + .enable = bperf_cgrp__enable, + .disable = bperf_cgrp__disable, + .read = bperf_cgrp__read, + .install_pe = bperf_cgrp__install_pe, + .destroy = bperf_cgrp__destroy, +}; diff --git a/tools/perf/util/bpf_skel/.gitignore b/tools/perf/util/bpf_skel/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5263e9e6c5d83a36cb002e310bd378f83dd0f8b6 --- /dev/null +++ b/tools/perf/util/bpf_skel/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +.tmp +*.skel.h \ No newline at end of file diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..ddda1582f04a2f091bbc3ccefb3406c7caefddd1 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +// Copyright (c) 2021 Google +#include "vmlinux.h" +#include +#include +#include + +#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary +#define MAX_EVENTS 32 // max events per cgroup: arbitrary + +// NOTE: many of map and global data will be modified before loading +// from the userspace (perf tool) using the skeleton helpers. + +// single set of global perf events to measure +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 1); +} events SEC(".maps"); + +// from cgroup id to event index +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, 1); +} cgrp_idx SEC(".maps"); + +// per-cpu event snapshots to calculate delta +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); +} prev_readings SEC(".maps"); + +// aggregated event values for each cgroup (per-cpu) +// will be read from the user-space +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); +} cgrp_readings SEC(".maps"); + +const volatile __u32 num_events = 1; +const volatile __u32 num_cpus = 1; + +int enabled; +int use_cgroup_v2; + +static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) +{ + struct task_struct *p = (void *)bpf_get_current_task(); + struct cgroup *cgrp; + register int i = 0; + __u32 *elem; + int level; + int cnt; + + cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_event_cgrp_id], cgroup); + level = BPF_CORE_READ(cgrp, level); + + for (cnt = 0; i < MAX_LEVELS; i++) { + __u64 cgrp_id; + + if (i > level) + break; + + // convert cgroup-id to a map index + cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]); + elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); + if (!elem) + continue; + + cgrps[cnt++] = *elem; + if (cnt == size) + break; + } + + return cnt; +} + +static inline int get_cgroup_v2_idx(__u32 *cgrps, int size) +{ + register int i = 0; + __u32 *elem; + int cnt; + + for (cnt = 0; i < MAX_LEVELS; i++) { + __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i); + + if (cgrp_id == 0) + break; + + // convert cgroup-id to a map index + elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); + if (!elem) + continue; + + cgrps[cnt++] = *elem; + if (cnt == size) + break; + } + + return cnt; +} + +static int bperf_cgroup_count(void) +{ + register __u32 idx = 0; // to have it in a register to pass BPF verifier + register int c = 0; + struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val; + __u32 cpu = bpf_get_smp_processor_id(); + __u32 cgrp_idx[MAX_LEVELS]; + int cgrp_cnt; + __u32 key, cgrp; + long err; + + if (use_cgroup_v2) + cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS); + else + cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS); + + for ( ; idx < MAX_EVENTS; idx++) { + if (idx == num_events) + break; + + // XXX: do not pass idx directly (for verifier) + key = idx; + // this is per-cpu array for diff + prev_val = bpf_map_lookup_elem(&prev_readings, &key); + if (!prev_val) { + val.counter = val.enabled = val.running = 0; + bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY); + + prev_val = bpf_map_lookup_elem(&prev_readings, &key); + if (!prev_val) + continue; + } + + // read from global perf_event array + key = idx * num_cpus + cpu; + err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); + if (err) + continue; + + if (enabled) { + delta.counter = val.counter - prev_val->counter; + delta.enabled = val.enabled - prev_val->enabled; + delta.running = val.running - prev_val->running; + + for (c = 0; c < MAX_LEVELS; c++) { + if (c == cgrp_cnt) + break; + + cgrp = cgrp_idx[c]; + + // aggregate the result by cgroup + key = cgrp * num_events + idx; + cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key); + if (cgrp_val) { + cgrp_val->counter += delta.counter; + cgrp_val->enabled += delta.enabled; + cgrp_val->running += delta.running; + } else { + bpf_map_update_elem(&cgrp_readings, &key, + &delta, BPF_ANY); + } + } + } + + *prev_val = val; + } + return 0; +} + +// This will be attached to cgroup-switches event for each cpu +SEC("perf_event") +int BPF_PROG(on_cgrp_switch) +{ + return bperf_cgroup_count(); +} + +SEC("raw_tp/sched_switch") +int BPF_PROG(trigger_read) +{ + return bperf_cgroup_count(); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..8821b9f7d097376350daa8f14ae2bd30f517a3da --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +#include "vmlinux.h" +#include +#include +#include "bperf_u.h" + +#define MAX_ENTRIES 102400 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} diff_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} accum_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bperf_filter_value)); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} filter SEC(".maps"); + +<<<<<<< HEAD +enum bperf_filter_type type; +int enabled; +======= +enum bperf_filter_type type = 0; +int enabled = 0; +int inherit; +>>>>>>> 8cb17f4d8605 (perf stat: Support inherit events during fork() for bperf) + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value *diff_val, *accum_val; + __u32 filter_key, zero = 0; + __u32 accum_key; + struct bperf_filter_value *fval; + + if (!enabled) + return 0; + + switch (type) { + case BPERF_FILTER_GLOBAL: + accum_key = zero; + goto do_add; + case BPERF_FILTER_CPU: + filter_key = bpf_get_smp_processor_id(); + break; + case BPERF_FILTER_PID: + filter_key = bpf_get_current_pid_tgid() & 0xffffffff; + break; + case BPERF_FILTER_TGID: + /* Use pid as the filter_key to exclude new task counts + * when inherit is disabled. Don't worry about the existing + * children in TGID losing their counts, bpf_counter has + * already added them to the filter map via perf_thread_map + * before this bpf prog runs. + */ + filter_key = inherit ? + bpf_get_current_pid_tgid() >> 32 : + bpf_get_current_pid_tgid() & 0xffffffff; + break; + default: + return 0; + } + + fval = bpf_map_lookup_elem(&filter, &filter_key); + if (!fval) + return 0; + + accum_key = fval->accum_key; + if (fval->exited) + bpf_map_delete_elem(&filter, &filter_key); + +do_add: + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); + if (!diff_val) + return 0; + + accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key); + if (!accum_val) + return 0; + + accum_val->counter += diff_val->counter; + accum_val->enabled += diff_val->enabled; + accum_val->running += diff_val->running; + + return 0; +} + +/* The program is only used for PID or TGID filter types. */ +SEC("tp_btf/task_newtask") +int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags) +{ + __u32 parent_key, child_key; + struct bperf_filter_value *parent_fval; + struct bperf_filter_value child_fval = { 0 }; + + if (!enabled) + return 0; + + switch (type) { + case BPERF_FILTER_PID: + parent_key = bpf_get_current_pid_tgid() & 0xffffffff; + child_key = task->pid; + break; + case BPERF_FILTER_TGID: + parent_key = bpf_get_current_pid_tgid() >> 32; + child_key = task->tgid; + if (child_key == parent_key) + return 0; + break; + default: + return 0; + } + + /* Check if the current task is one of the target tasks to be counted */ + parent_fval = bpf_map_lookup_elem(&filter, &parent_key); + if (!parent_fval) + return 0; + + /* Start counting for the new task by adding it into filter map, + * inherit the accum key of its parent task so that they can be + * counted together. + */ + child_fval.accum_key = parent_fval->accum_key; + child_fval.exited = 0; + bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST); + + return 0; +} + +/* The program is only used for PID or TGID filter types. */ +SEC("tp_btf/sched_process_exit") +int BPF_PROG(on_exittask, struct task_struct *task) +{ + __u32 pid; + struct bperf_filter_value *fval; + + if (!enabled) + return 0; + + /* Stop counting for this task by removing it from filter map. + * For TGID type, if the pid can be found in the map, it means that + * this pid belongs to the leader task. After the task exits, the + * tgid of its child tasks (if any) will be 1, so the pid can be + * safely removed. + */ + pid = task->pid; + fval = bpf_map_lookup_elem(&filter, &pid); + if (fval) + fval->exited = 1; + + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..e2a2d4cd7779ce703618fd366fce63bddf871bbe --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +#include "vmlinux.h" +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); + __uint(map_flags, BPF_F_PRESERVE_ELEMS); +} events SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} prev_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} diff_readings SEC(".maps"); + +SEC("raw_tp/sched_switch") +int BPF_PROG(on_switch) +{ + struct bpf_perf_event_value val, *prev_val, *diff_val; + __u32 key = bpf_get_smp_processor_id(); + __u32 zero = 0; + long err; + + prev_val = bpf_map_lookup_elem(&prev_readings, &zero); + if (!prev_val) + return 0; + + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); + if (!diff_val) + return 0; + + err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); + if (err) + return 0; + + diff_val->counter = val.counter - prev_val->counter; + diff_val->enabled = val.enabled - prev_val->enabled; + diff_val->running = val.running - prev_val->running; + *prev_val = val; + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h new file mode 100644 index 0000000000000000000000000000000000000000..4a4a753980bef8f03c82f9fbe529757866d4f8ca --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_u.h @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook + +#ifndef __BPERF_STAT_U_H +#define __BPERF_STAT_U_H + +enum bperf_filter_type { + BPERF_FILTER_GLOBAL = 1, + BPERF_FILTER_CPU, + BPERF_FILTER_PID, + BPERF_FILTER_TGID, +}; + +struct bperf_filter_value { + __u32 accum_key; + __u8 exited; +}; + +#endif /* __BPERF_STAT_U_H */ diff --git a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..72a5131e0eb91017973002661ff0ddfbfdaef3ae --- /dev/null +++ b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2020 Facebook +#include "vmlinux.h" +#include +#include + +/* map of perf event fds, num_cpu * num_metric entries */ +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); +} events SEC(".maps"); + +/* readings at fentry */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} fentry_readings SEC(".maps"); + +/* accumulated readings */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} accum_readings SEC(".maps"); + +const volatile __u32 num_cpu = 1; + +SEC("fentry/XXX") +int BPF_PROG(fentry_XXX) +{ + __u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *ptr; + __u32 zero = 0; + long err; + + /* look up before reading, to reduce error */ + ptr = bpf_map_lookup_elem(&fentry_readings, &zero); + if (!ptr) + return 0; + + err = bpf_perf_event_read_value(&events, key, ptr, sizeof(*ptr)); + if (err) + return 0; + + return 0; +} + +static inline void +fexit_update_maps(struct bpf_perf_event_value *after) +{ + struct bpf_perf_event_value *before, diff, *accum; + __u32 zero = 0; + + before = bpf_map_lookup_elem(&fentry_readings, &zero); + /* only account samples with a valid fentry_reading */ + if (before && before->counter) { + struct bpf_perf_event_value *accum; + + diff.counter = after->counter - before->counter; + diff.enabled = after->enabled - before->enabled; + diff.running = after->running - before->running; + + accum = bpf_map_lookup_elem(&accum_readings, &zero); + if (accum) { + accum->counter += diff.counter; + accum->enabled += diff.enabled; + accum->running += diff.running; + } + } +} + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value reading; + __u32 cpu = bpf_get_smp_processor_id(); + __u32 one = 1, zero = 0; + int err; + + /* read all events before updating the maps, to reduce error */ + err = bpf_perf_event_read_value(&events, cpu, &reading, sizeof(reading)); + if (err) + return 0; + + fexit_update_maps(&reading); + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index b81324a13a2b69668d8e192e6bd4c804955826a0..22bd8c97338a20a3b9fa56934fc5743de1e651aa 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -9,12 +9,14 @@ #include #include #include +#include #include #include #include #include int nr_cgroups; +bool cgrp_event_expanded; static int open_cgroup(const char *name) { @@ -35,6 +37,49 @@ static int open_cgroup(const char *name) return fd; } +#ifdef HAVE_FILE_HANDLE +int read_cgroup_id(struct cgroup *cgrp) +{ + char path[PATH_MAX + 1]; + char mnt[PATH_MAX + 1]; + struct { + struct file_handle fh; + uint64_t cgroup_id; + } handle; + int mount_id; + + if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1, "perf_event")) + return -1; + + scnprintf(path, PATH_MAX, "%s/%s", mnt, cgrp->name); + + handle.fh.handle_bytes = sizeof(handle.cgroup_id); + if (name_to_handle_at(AT_FDCWD, path, &handle.fh, &mount_id, 0) < 0) + return -1; + + cgrp->id = handle.cgroup_id; + return 0; +} +#endif /* HAVE_FILE_HANDLE */ + +#ifndef CGROUP2_SUPER_MAGIC +#define CGROUP2_SUPER_MAGIC 0x63677270 +#endif + +int cgroup_is_v2(const char *subsys) +{ + char mnt[PATH_MAX + 1]; + struct statfs stbuf; + + if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1, subsys)) + return -1; + + if (statfs(mnt, &stbuf) < 0) + return -1; + + return (stbuf.f_type == CGROUP2_SUPER_MAGIC); +} + static struct cgroup *evlist__find_cgroup(struct evlist *evlist, const char *str) { struct evsel *counter; @@ -290,6 +335,8 @@ int evlist__expand_cgroup(struct evlist *evlist, const char *str, str = p+1; } + cgrp_event_expanded = true; + out_err: evlist__delete(orig_list); evlist__delete(tmp_list); diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h index 162906f3412ae3b42f322425da3a4d0b5fe52119..12256b78608c2f9d0ce89e169bd3581795d3fca9 100644 --- a/tools/perf/util/cgroup.h +++ b/tools/perf/util/cgroup.h @@ -2,6 +2,7 @@ #ifndef __CGROUP_H__ #define __CGROUP_H__ +#include #include #include #include "util/env.h" @@ -17,6 +18,7 @@ struct cgroup { }; extern int nr_cgroups; /* number of explicit cgroups defined */ +extern bool cgrp_event_expanded; struct cgroup *cgroup__get(struct cgroup *cgroup); void cgroup__put(struct cgroup *cgroup); @@ -38,4 +40,15 @@ struct cgroup *cgroup__find(struct perf_env *env, uint64_t id); void perf_env__purge_cgroups(struct perf_env *env); +#ifdef HAVE_FILE_HANDLE +int read_cgroup_id(struct cgroup *cgrp); +#else +static inline int read_cgroup_id(struct cgroup *cgrp __maybe_unused) +{ + return -1; +} +#endif /* HAVE_FILE_HANDLE */ + +int cgroup_is_v2(const char *subsys); + #endif /* __CGROUP_H__ */ diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 6969f82843eeff0c646d5328ed5b2c165e22d907..9d3f61579ed093e7e6dc38bdc52d5109e5532bac 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -18,6 +18,7 @@ #include "util/hist.h" /* perf_hist_config */ #include "util/llvm-utils.h" /* perf_llvm_config */ #include "util/stat.h" /* perf_stat__set_big_num */ +#include "util/evsel.h" /* evsel__hw_names, evsel__use_bpf_counters */ #include "build-id.h" #include "debug.h" #include "config.h" @@ -457,6 +458,9 @@ static int perf_stat_config(const char *var, const char *value) if (!strcmp(var, "stat.big-num")) perf_stat__set_big_num(perf_config_bool(var, value)); + if (!strcmp(var, "stat.bpf-counter-events")) + evsel__bpf_counter_events = strdup(value); + /* Add other config variables here. */ return 0; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 13dca5cfd63a99dd19bd0e030f7baf391821e2da..7240469ee1d60ce12b6583e76dfd190d6e7c3307 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -17,6 +17,7 @@ #include "evsel.h" #include "debug.h" #include "units.h" +#include "bpf_counter.h" #include // page_size #include "affinity.h" #include "../perf.h" @@ -397,6 +398,9 @@ void evlist__disable(struct evlist *evlist) if (affinity__setup(&affinity) < 0) return; + evlist__for_each_entry(evlist, pos) + bpf_counter__disable(pos); + /* Disable 'immediate' events last */ for (imm = 0; imm <= 1; imm++) { evlist__for_each_cpu(evlist, i, cpu) { diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 12daf40ae89be74d6df2d830763606dac90eb57d..dda5e0301282f00e2afef99015cfe9800386f255 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -25,6 +25,7 @@ #include #include #include "asm/bug.h" +#include "bpf_counter.h" #include "callchain.h" #include "cgroup.h" #include "counts.h" @@ -247,6 +248,7 @@ void evsel__init(struct evsel *evsel, evsel->bpf_obj = NULL; evsel->bpf_fd = -1; INIT_LIST_HEAD(&evsel->config_terms); + INIT_LIST_HEAD(&evsel->bpf_counter_list); perf_evsel__object.init(evsel); evsel->sample_size = __evsel__sample_size(attr->sample_type); evsel__calc_id_pos(evsel); @@ -489,6 +491,28 @@ const char *evsel__hw_names[PERF_COUNT_HW_MAX] = { "ref-cycles", }; +char *evsel__bpf_counter_events; + +bool evsel__match_bpf_counter_events(const char *name) +{ + int name_len; + bool match; + char *ptr; + + if (!evsel__bpf_counter_events) + return false; + + ptr = strstr(evsel__bpf_counter_events, name); + name_len = strlen(name); + + /* check name matches a full token in evsel__bpf_counter_events */ + match = (ptr != NULL) && + ((ptr == evsel__bpf_counter_events) || (*(ptr - 1) == ',')) && + ((*(ptr + name_len) == ',') || (*(ptr + name_len) == '\0')); + + return match; +} + static const char *__evsel__hw_name(u64 config) { if (config < PERF_COUNT_HW_MAX && evsel__hw_names[config]) @@ -1392,6 +1416,7 @@ void evsel__exit(struct evsel *evsel) { assert(list_empty(&evsel->core.node)); assert(evsel->evlist == NULL); + bpf_counter__destroy(evsel); evsel__free_counts(evsel); perf_evsel__free_fd(&evsel->core); perf_evsel__free_id(&evsel->core); @@ -1819,6 +1844,8 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, FD(evsel, cpu, thread) = fd; + bpf_counter__install_pe(evsel, cpu, fd); + if (unlikely(test_attr__enabled)) { test_attr__open(&evsel->core.attr, pid, cpus->map[cpu].cpu, fd, group_fd, flags); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index b0d1c67f7617a28df5efb7fbb5c4762f851da3ed..ce67bdbbeb4eba41960e93c3bfe9e04aba09f289 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -17,6 +17,10 @@ struct cgroup; struct perf_counts; struct perf_stat_evsel; union perf_event; +struct bpf_counter_ops; +struct target; +struct bperf_leader_bpf; +struct bperf_follower_bpf; typedef int (evsel__sb_cb_t)(union perf_event *event, void *data); @@ -77,6 +81,7 @@ struct evsel { bool auto_merge_stats; bool collect_stat; bool weak_group; + bool bpf_counter; int bpf_fd; struct bpf_object *bpf_obj; }; @@ -128,6 +133,24 @@ struct evsel { * See also evsel__has_callchain(). */ __u64 synth_sample_type; + + /* + * bpf_counter_ops serves two use cases: + * 1. perf-stat -b counting events used byBPF programs + * 2. perf-stat --use-bpf use BPF programs to aggregate counts + */ + struct bpf_counter_ops *bpf_counter_ops; + + /* for perf-stat -b */ + struct list_head bpf_counter_list; + + /* for perf-stat --use-bpf */ + int bperf_leader_prog_fd; + int bperf_leader_link_fd; + union { + struct bperf_leader_bpf *leader_skel; + struct bperf_follower_bpf *follower_skel; + }; }; struct perf_missing_features { @@ -218,6 +241,16 @@ void evsel__calc_id_pos(struct evsel *evsel); bool evsel__is_cache_op_valid(u8 type, u8 op); +static inline bool evsel__is_bpf(struct evsel *evsel) +{ + return evsel->bpf_counter_ops != NULL; +} + +static inline bool evsel__is_bperf(struct evsel *evsel) +{ + return evsel->bpf_counter_ops != NULL && list_empty(&evsel->bpf_counter_list); +} + #define EVSEL__MAX_ALIASES 8 extern const char *evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX][EVSEL__MAX_ALIASES]; @@ -225,6 +258,9 @@ extern const char *evsel__hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX][EVSEL__MAX_ALI extern const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_ALIASES]; extern const char *evsel__hw_names[PERF_COUNT_HW_MAX]; extern const char *evsel__sw_names[PERF_COUNT_SW_MAX]; +extern char *evsel__bpf_counter_events; +bool evsel__match_bpf_counter_events(const char *name); + int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size); const char *evsel__name(struct evsel *evsel); @@ -430,4 +466,5 @@ static inline bool evsel__is_dummy_event(struct evsel *evsel) struct perf_env *evsel__env(struct evsel *evsel); int evsel__store_ids(struct evsel *evsel, struct evlist *evlist); + #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4f988f768697cfcc88f2bd72750b839d1e99bd7e..afc0636d6f46c90fc41c4279a3c4961f5b7d33d3 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -145,6 +145,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { .symbol = "bpf-output", .alias = "", }, + [PERF_COUNT_SW_CGROUP_SWITCHES] = { + .symbol = "cgroup-switches", + .alias = "", + }, }; #define __PERF_EVENT_FIELD(config, name) \ @@ -1800,6 +1804,7 @@ struct event_modifier { int pinned; int weak; int exclusive; + int bpf_counter; }; static int get_event_modifier(struct event_modifier *mod, char *str, @@ -1820,6 +1825,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, int exclude = eu | ek | eh; int exclude_GH = evsel ? evsel->exclude_GH : 0; int weak = 0; + int bpf_counter = 0; memset(mod, 0, sizeof(*mod)); @@ -1863,6 +1869,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str, exclusive = 1; } else if (*str == 'W') { weak = 1; + } else if (*str == 'b') { + bpf_counter = 1; } else break; @@ -1894,6 +1902,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, mod->sample_read = sample_read; mod->pinned = pinned; mod->weak = weak; + mod->bpf_counter = bpf_counter; mod->exclusive = exclusive; return 0; @@ -1908,7 +1917,7 @@ static int check_modifier(char *str) char *p = str; /* The sizeof includes 0 byte as well. */ - if (strlen(str) > (sizeof("ukhGHpppPSDIWe") - 1)) + if (strlen(str) > (sizeof("ukhGHpppPSDIWeb") - 1)) return -1; while (*p) { @@ -1949,6 +1958,7 @@ int parse_events__modifier_event(struct list_head *list, char *str, bool add) evsel->sample_read = mod.sample_read; evsel->precise_max = mod.precise_max; evsel->weak_group = mod.weak; + evsel->bpf_counter = mod.bpf_counter; if (evsel__is_group_leader(evsel)) { evsel->core.attr.pinned = mod.pinned; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 45b1aaf32e25158d4ed68cae423914fb38f6685c..682e39ee369e6407238fac132fcf297badb561b5 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -212,7 +212,7 @@ name_tag [\'][a-zA-Z_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\'] name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]* drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? /* If you add a modifier you need to update check_modifier() */ -modifier_event [ukhpPGHSDIWe]+ +modifier_event [ukhpPGHSDIWeb]+ modifier_bp [rwx]{1,3} %% @@ -349,6 +349,7 @@ emulation-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EM dummy { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } duration_time { return tool(yyscanner, PERF_TOOL_DURATION_TIME); } bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); } +cgroup-switches { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CGROUP_SWITCHES); } /* * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately. diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index ab146924981af5686091e7b39220d92537764aaa..371fa74718b2133d32a629abe65306b45662c47a 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -79,6 +79,33 @@ int metricgroup__copy_metric_events(struct evlist *evlist, struct cgroup *cgrp, return 0; } +/* + * XXX: All these evsel destructors need some better mechanism, like a linked + * list of destructors registered when the relevant code indeed is used instead + * of having more and more calls in perf_evsel__delete(). -- acme + * + * For now, add some more: + * + * Not to drag the BPF bandwagon... + */ +void bpf_counter__destroy(struct evsel *evsel); +int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd); +int bpf_counter__disable(struct evsel *evsel); + +void bpf_counter__destroy(struct evsel *evsel __maybe_unused) +{ +} + +int bpf_counter__install_pe(struct evsel *evsel __maybe_unused, int cpu __maybe_unused, int fd __maybe_unused) +{ + return 0; +} + +int bpf_counter__disable(struct evsel *evsel __maybe_unused) +{ + return 0; +} + /* * Support debug printing even though util/debug.c is not linked. That means * implementing 'verbose' and 'eprintf'. diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index e84206f957b2357b7362959e9aa9c92b7a0b53f6..191e1b702b55c24dc964b7f7f09434f29d5bdb32 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -1027,7 +1027,9 @@ static void print_header(struct perf_stat_config *config, if (!config->csv_output) { fprintf(output, "\n"); fprintf(output, " Performance counter stats for "); - if (_target->system_wide) + if (_target->bpf_str) + fprintf(output, "\'BPF program(s) %s", _target->bpf_str); + else if (_target->system_wide) fprintf(output, "\'system wide"); else if (_target->cpu_list) fprintf(output, "\'CPU(s) %s", _target->cpu_list); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index be326227b05563ea66aef03136f7dca211f6a0b2..e95dc23dd478239405a4dac031c6d9aa191f6876 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -527,7 +527,7 @@ int create_perf_stat_counter(struct evsel *evsel, if (leader->core.nr_members > 1) attr->read_format |= PERF_FORMAT_ID|PERF_FORMAT_GROUP; - attr->inherit = !config->no_inherit; + attr->inherit = !config->no_inherit && list_empty(&evsel->bpf_counter_list); /* * Some events get initialized with sample_(period/type) set, diff --git a/tools/perf/util/target.c b/tools/perf/util/target.c index a3db13dea937cc40061bff4a3bc29e6b45089a61..0f383418e3df5e568fe8459253f851ddf32e947a 100644 --- a/tools/perf/util/target.c +++ b/tools/perf/util/target.c @@ -56,6 +56,34 @@ enum target_errno target__validate(struct target *target) ret = TARGET_ERRNO__UID_OVERRIDE_SYSTEM; } + /* BPF and CPU are mutually exclusive */ + if (target->bpf_str && target->cpu_list) { + target->cpu_list = NULL; + if (ret == TARGET_ERRNO__SUCCESS) + ret = TARGET_ERRNO__BPF_OVERRIDE_CPU; + } + + /* BPF and PID/TID are mutually exclusive */ + if (target->bpf_str && target->tid) { + target->tid = NULL; + if (ret == TARGET_ERRNO__SUCCESS) + ret = TARGET_ERRNO__BPF_OVERRIDE_PID; + } + + /* BPF and UID are mutually exclusive */ + if (target->bpf_str && target->uid_str) { + target->uid_str = NULL; + if (ret == TARGET_ERRNO__SUCCESS) + ret = TARGET_ERRNO__BPF_OVERRIDE_UID; + } + + /* BPF and THREADS are mutually exclusive */ + if (target->bpf_str && target->per_thread) { + target->per_thread = false; + if (ret == TARGET_ERRNO__SUCCESS) + ret = TARGET_ERRNO__BPF_OVERRIDE_THREAD; + } + /* THREAD and SYSTEM/CPU are mutually exclusive */ if (target->per_thread && (target->system_wide || target->cpu_list)) { target->per_thread = false; @@ -109,6 +137,10 @@ static const char *target__error_str[] = { "PID/TID switch overriding SYSTEM", "UID switch overriding SYSTEM", "SYSTEM/CPU switch overriding PER-THREAD", + "BPF switch overriding CPU", + "BPF switch overriding PID/TID", + "BPF switch overriding UID", + "BPF switch overriding THREAD", "Invalid User: %s", "Problems obtaining information for user %s", }; @@ -134,7 +166,7 @@ int target__strerror(struct target *target, int errnum, switch (errnum) { case TARGET_ERRNO__PID_OVERRIDE_CPU ... - TARGET_ERRNO__SYSTEM_OVERRIDE_THREAD: + TARGET_ERRNO__BPF_OVERRIDE_THREAD: snprintf(buf, buflen, "%s", msg); break; diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h index 6ef01a83b24e971cfeef0149610eb97514d8248d..f348344d7f34bfd7b6fe0d3b45ea5bf6f1301345 100644 --- a/tools/perf/util/target.h +++ b/tools/perf/util/target.h @@ -10,11 +10,15 @@ struct target { const char *tid; const char *cpu_list; const char *uid_str; + const char *bpf_str; uid_t uid; bool system_wide; bool uses_mmap; bool default_per_cpu; bool per_thread; + bool use_bpf; + bool inherit; + const char *attr_map; }; enum target_errno { @@ -36,6 +40,10 @@ enum target_errno { TARGET_ERRNO__PID_OVERRIDE_SYSTEM, TARGET_ERRNO__UID_OVERRIDE_SYSTEM, TARGET_ERRNO__SYSTEM_OVERRIDE_THREAD, + TARGET_ERRNO__BPF_OVERRIDE_CPU, + TARGET_ERRNO__BPF_OVERRIDE_PID, + TARGET_ERRNO__BPF_OVERRIDE_UID, + TARGET_ERRNO__BPF_OVERRIDE_THREAD, /* for target__parse_uid() */ TARGET_ERRNO__INVALID_UID, diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a3f99ef6b11ba660d1c0b81ef2887e41fd213c1e..397fe4787d16bb7da9831bf1e5d7a1962a8913e5 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -135,6 +135,7 @@ ifneq ($(silent),1) $(MAKE) $(PRINT_DIR) -C $$subdir QUIET_FLEX = @echo ' FLEX '$@; QUIET_BISON = @echo ' BISON '$@; + QUIET_GENSKEL = @echo ' GEN-SKEL '$@; descend = \ +@echo ' DESCEND '$(1); \