diff --git a/.gitmodules b/.gitmodules
index e0fc23552c477cf77a1ae80900bf9ec489584f6e..549774f3487d77df3730391e8a23d037229df3ed 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "googletest"]
 	path = third_party/googletest
 	url = https://gitee.com/mirrors/googletest.git
+[submodule "third_party/libbpf"]
+	path = third_party/libbpf
+	url = https://gitee.com/mirrors/libbpf.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f013847e946f13209dc6488db54862bd1f5294e6..95ced186753cd904c647c4c63b7dea56b8535e7b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,4 +60,7 @@ if (INCLUDE_TEST)
     add_subdirectory(test)
 endif()
 
+option(BPF "Enable BPF mode" OFF)
+message(STATUS "BPF support: ${BPF}")
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS True)
diff --git a/build.sh b/build.sh
index 6fb130037dc5f0a97ba6ee0baa647336d08da515..079cef3cce1827a50f81f3bda3c62a00342aae5b 100644
--- a/build.sh
+++ b/build.sh
@@ -20,6 +20,7 @@ PROJECT_DIR=$(realpath "${CURRENT_DIR}")
 BUILD_DIR=${PROJECT_DIR}/_build
 THIRD_PARTY=${PROJECT_DIR}/third_party/
 INSTALL_PATH=${PROJECT_DIR}/output/
+BPF_DIR=${PROJECT_DIR}/pmu/bpf
 BUILD_TYPE=Release
 # Python module are not compiled by default.
 PYTHON=false
@@ -27,6 +28,8 @@ PYTHON=false
 INCLUDE_TEST=false
 # Go support, copy so and head files
 GO=false
+# Bpf mode for counting
+BPF=false
 
 source ${PROJECT_DIR}/build/common.sh
 
@@ -66,6 +69,9 @@ for arg in "$@"; do
         go=*)
             GO="${arg#*=}"
             ;;
+        bpf=*)
+            BPF="${arg#*=}"
+            ;;
     esac
 done
 
@@ -73,6 +79,11 @@ if [[ "$INCLUDE_TEST" == "true" ]]; then
     build_googletest $THIRD_PARTY
 fi
 
+if [[ "$BPF" == "true" ]]; then
+    build_libbpf $THIRD_PARTY
+    build_skel_files $BPF_DIR $THIRD_PARTY
+fi
+
 function build_elfin() {
     local cmake_target_dir=$THIRD_PARTY/local/elfin-parser
     rm -rf ${cmake_target_dir}
@@ -110,6 +121,7 @@ build_libkperf()
         "-DGO=${GO}"
         "-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}"
         "-DCMAKE_BUILD_TYPE=${BUILD_TYPE}"
+        "-DBPF=${BPF}"
     )
     if [ ! -z ${PYTHON_EXE} ];then
         CMAKE_ARGS+=("-DPYTHON_KPERF=${PYTHON_EXE}")
diff --git a/build/common.sh b/build/common.sh
index f48b64e7ad234c9bc0665c4acdda5a55ad6c15cc..0ce407ce2ac660b10a6012a0b351ec3da1e4d198 100644
--- a/build/common.sh
+++ b/build/common.sh
@@ -13,6 +13,9 @@
 # Description: Partial methods for building scripts.
 set -e
 
+export BPF_CLANG="clang"
+export BPF_TOOL="bpftool"
+
 cpu_core_num=$(($(nproc)-1))
 
 if [ "$cpu_core_num" -eq 0 ];then
@@ -76,4 +79,58 @@ function execute_binary() {
         echo "执行命令: $command"
         eval "$command"
     done
-}
\ No newline at end of file
+}
+
+# Build the libbpf submodule and install it (make install DESTDIR=...) into
+# <third_party>/local/bpf. The build is skipped when that directory already exists.
+#   $1 - third_party directory (a trailing slash is accepted)
+function build_libbpf() {
+    local open_source_dir=${1%/}
+    local cmake_target_dir=${open_source_dir}/local/bpf
+    if [ -d "${cmake_target_dir}" ];then
+        echo ${cmake_target_dir} "is exist"
+        return
+    else
+        echo ${cmake_target_dir} "is not exist"
+    fi
+    pushd "$open_source_dir/libbpf/src"
+    make -j ${cpu_core_num}
+    make install DESTDIR="${cmake_target_dir}"
+    # Undo the pushd above so the caller keeps its working directory.
+    popd
+    echo "install log path: $cmake_target_dir"
+}
+
+# Dump the running kernel's BTF into vmlinux.h, then compile every *.bpf.c into a
+# BPF object and generate its skeleton header next to the source file.
+#   $1 - directory holding the *.bpf.c sources
+#   $2 - third_party directory; vmlinux.h is written to <$2>/local/bpf
+function build_skel_files() {
+    command -v $BPF_CLANG &> /dev/null || error_exit "Error: $BPF_CLANG not found. Please install LLVM/Clang."
+    command -v $BPF_TOOL &> /dev/null || error_exit "Error: $BPF_TOOL not found. Please install bpftool."
+
+    local bpf_file_dir=$1
+    local bpf_lib_dir=${2%/}
+    local vmlinux_header="${bpf_lib_dir}/local/bpf/vmlinux.h"
+    $BPF_TOOL btf dump file /sys/kernel/btf/vmlinux format c > "${vmlinux_header}"
+    if [ -s "${vmlinux_header}" ]; then
+        echo "The kernel header file generated."
+    else
+        echo "Generate vmlinux.h file failed."
+        exit 1
+    fi
+
+    for bpf_src in "${bpf_file_dir}"/*.bpf.c; do
+        [ -f "$bpf_src" ] || continue
+        src_name=$(basename "${bpf_src%.bpf.c}")
+        obj_path="${bpf_file_dir}/${src_name}.bpf.o"
+        skel_path="${bpf_file_dir}/${src_name}.skel.h"
+
+        echo "compile: $src_name"
+        $BPF_CLANG "-I${bpf_lib_dir}/local" -g -O2 -target bpf -c "$bpf_src" -o "$obj_path"
+        [ -s "$obj_path" ] || { echo "Error: The obj file was not generated."; exit 1; }
+        $BPF_TOOL gen skeleton "$obj_path" > "$skel_path"
+        [ -s "$skel_path" ] || { echo "Error: The skeleton file was not generated."; exit 1; }
+        grep -q 'struct bpf_prog' "$skel_path" || { echo "Error: invalid skeleton format."; exit 1; }
+        echo "generate: ${src_name}.skel.h"
+    done
+}
diff --git a/pmu/CMakeLists.txt b/pmu/CMakeLists.txt
index 90a149f54d8d912fdabaea6914cee4132dce0e02..4d7f7cf4f3b30fffeea4b3f95799f01c5007bcaf 100644
--- a/pmu/CMakeLists.txt
+++ b/pmu/CMakeLists.txt
@@ -13,6 +13,7 @@ set(SYMBOL_FILE_DIR ${PROJECT_TOP_DIR}/symbol)
 set(PMU_FILE_DIR ${PROJECT_TOP_DIR}/pmu)
 set(PFM_FILE_DIR ${PROJECT_TOP_DIR}/pmu/pfm)
 set(PMU_DECODER_DIR ${PMU_FILE_DIR}/decoder)
+set(PMU_BPF_DIR ${PMU_FILE_DIR}/bpf)
 
 # Source files #
 file(GLOB UTIL_SRC ${UTIL_FILE_DIR}/*.cpp)
@@ -20,6 +21,7 @@ file(GLOB PMU_SRC ${PMU_FILE_DIR}/*c ${PMU_FILE_DIR}/*cpp)
 file(GLOB PMU_DECODER_SRC ${PMU_DECODER_DIR}/*.cpp)
 file(GLOB SYMBOL_SRC ${SYMBOL_FILE_DIR}/*c ${SYMBOL_FILE_DIR}/*cpp)
 file(GLOB PFM_SRC ${PFM_FILE_DIR}/*c ${PFM_FILE_DIR}/*cpp)
+file(GLOB BPF_SRC ${PMU_BPF_DIR}/*cpp)
 
 include_directories(${PROJECT_TOP_DIR}/include)
 include_directories(${PMU_FILE_DIR}/)
@@ -30,8 +32,18 @@ include_directories(${UTIL_FILE_DIR})
 include_directories(${SYMBOL_FILE_DIR})
 include_directories(${PMU_DECODER_DIR})
 
-ADD_LIBRARY(kperf SHARED ${PMU_SRC} ${UTIL_SRC} ${PFM_SRC} ${PMU_DECODER_SRC})
-ADD_LIBRARY(kperf_static STATIC ${PMU_SRC} ${UTIL_SRC} ${PFM_SRC} ${PMU_DECODER_SRC})
+if (BPF)
+    message(STATUS "BPF is true. building with bpf submodule")
+    add_compile_definitions(BPF_ENABLED)
+    include_directories(${PMU_FILE_DIR}/bpf)
+    include_directories(${PROJECT_TOP_DIR}/third_party/libbpf)
+    link_directories(${PROJECT_TOP_DIR}/third_party/local/bpf)
+    set(BPF_SOURCES ${BPF_SRC})
+else ()
+    set(BPF_SOURCES "")
+endif()
+ADD_LIBRARY(kperf SHARED ${PMU_SRC} ${UTIL_SRC} ${PFM_SRC} ${PMU_DECODER_SRC} ${BPF_SOURCES})
+ADD_LIBRARY(kperf_static STATIC ${PMU_SRC} ${UTIL_SRC} ${PFM_SRC} ${PMU_DECODER_SRC} ${BPF_SOURCES})
 set_target_properties(kperf_static PROPERTIES OUTPUT_NAME "kperf")
 target_link_libraries(kperf sym)
 target_compile_options(kperf PRIVATE -fPIC)
diff --git a/pmu/bpf/sched_cgroup.bpf.c b/pmu/bpf/sched_cgroup.bpf.c
new file mode 100644
index 0000000000000000000000000000000000000000..60321a7bea716bae6b39b1140d0ff37e9e14d224
--- /dev/null
+++ b/pmu/bpf/sched_cgroup.bpf.c
@@ -0,0 +1,192 @@
+/******************************************************************************
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ * libkperf licensed under the Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *     http://license.coscl.org.cn/MulanPSL2
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+ * PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * Author: Wu
+ * Create: 2025-08-10
+ * Description: the bpf program for cgroup collecting in counting mode
+ ******************************************************************************/
+// NOTE(review): the four include targets below were missing from the reviewed text and are
+// reconstructed from the symbols this file uses and the -I path in build/common.sh - confirm.
+#include <bpf/vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
+#define MAX_EVENTS  128  // max events per cgroup: arbitrary
+#define MAX_ENTRIES 102400
+
+// Per-switch tracing is only compiled in with -DKPERF_BPF_DEBUG: bpf_printk() is a slow
+// debugging helper and must not run on every context switch in normal builds.
+#ifdef KPERF_BPF_DEBUG
+#define kperf_dbg(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
+#else
+#define kperf_dbg(fmt, ...) do { } while (0)
+#endif
+
+// single set of global perf events to measure
+// {evt0, cpu0}, {evt0, cpu1}, {evt0, cpu2}...{evt0, cpuM}, {evt1, cpu0}...{evtM, cpuM}
+struct {
+    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+    __uint(key_size, sizeof(__u32));
+    __uint(value_size, sizeof(int));
+    __uint(map_flags, BPF_F_PRESERVE_ELEMS);
+} events SEC(".maps");
+
+// from cgroup id to event index
+// key: cgroup id from OS
+// value: internal id from 0...M
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(key_size, sizeof(__u64));
+    __uint(value_size, sizeof(__u32));
+    __uint(max_entries, MAX_ENTRIES);
+} cgrp_idx SEC(".maps");
+
+// per-cpu event snapshots to calculate delta
+// {evt0}, {evt1}...{evtM}
+struct {
+    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+    __uint(key_size, sizeof(__u32));
+    __uint(value_size, sizeof(struct bpf_perf_event_value));
+} prev_readings SEC(".maps");
+
+// aggregated event values for each cgroup (per-cpu)
+// will be read from the user-space
+// {cgrp0, evt0, cpu0}, {cgrp0, evt0, cpu1}...{cgrp0, evt0, cpuM}, {cgrp0, evt1, cpu0}...{cgrp0, evtM, cpuM}...{cgrpM, evtM, cpuM}
+struct {
+    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+    __uint(key_size, sizeof(__u32));
+    __uint(value_size, sizeof(struct bpf_perf_event_value));
+} cgrp_readings SEC(".maps");
+
+const volatile __u32 num_events = 1;
+const volatile __u32 num_cpus = 1;
+
+// Collect the cgrp_idx map values for the current task's perf_event cgroup and its ancestors.
+// Walks ancestor levels 0..level (at most MAX_LEVELS), stores each hit in cgrps[] and returns
+// the number stored (never more than size); returns 0 when ancestor_ids is not available.
+static inline int get_cgroup_idx(__u32 *cgrps, int size)
+{
+    struct task_struct *p = (void *)bpf_get_current_task();
+    struct cgroup *cgrp;
+    register int i = 0;
+    __u32 *elem;
+    int level;
+    int cnt;
+
+    cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_event_cgrp_id], cgroup);
+    level = BPF_CORE_READ(cgrp, level);
+
+    for (cnt = 0; i < MAX_LEVELS; i++) {
+        __u64 cgrp_id;
+
+        if (i > level) {
+            break;
+        }
+
+        // convert cgroup-id to a map index
+        if (bpf_core_field_exists(cgrp->ancestor_ids)) {
+            cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
+        } else {
+            bpf_printk("cannot get ancestor_ids, this field not in struct cgroup");
+            return 0;
+        }
+        elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
+        if (!elem) {
+            continue;
+        }
+
+        cgrps[cnt++] = *elem;
+        if (cnt == size) {
+            break;
+        }
+    }
+
+    return cnt;
+}
+
+// For each event: read this CPU's counter, compute the delta against the previous reading,
+// add it to every cgroup slot returned by get_cgroup_idx(), then store the new reading.
+static int bperf_cgroup_count(void)
+{
+    register __u32 idx = 0;  // to have it in a register to pass BPF verifier
+    register int c = 0;
+    struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
+    __u32 cpu = bpf_get_smp_processor_id();
+    __u32 cgrp_idx[MAX_LEVELS];
+    int cgrp_cnt;
+    __u32 key, cgrp;
+    long err;
+
+    cgrp_cnt = get_cgroup_idx(cgrp_idx, MAX_LEVELS);
+
+    for (; idx < MAX_EVENTS; idx++) {
+        if (idx == num_events)
+            break;
+
+        // XXX: do not pass idx directly (for verifier)
+        key = idx;
+        // this is per-cpu array for diff
+        prev_val = bpf_map_lookup_elem(&prev_readings, &key);
+        if (!prev_val) {
+            val.counter = val.enabled = val.running = 0;
+            bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);
+
+            prev_val = bpf_map_lookup_elem(&prev_readings, &key);
+            if (!prev_val) {
+                return 0;
+            }
+        }
+
+        // read from global perf_event array
+        key = idx * num_cpus + cpu;
+        err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
+        if (err) {
+            bpf_printk("bpf_perf_event_read_value failed, continue");
+            continue;
+        }
+
+        delta.counter = val.counter - prev_val->counter;
+        delta.enabled = val.enabled - prev_val->enabled;
+        delta.running = val.running - prev_val->running;
+        kperf_dbg("prev_val : %ld val : %ld delta : %ld \n", prev_val->counter, val.counter, delta.counter);
+
+        for (c = 0; c < MAX_LEVELS; c++) {
+            if (c == cgrp_cnt)
+                break;
+            cgrp = cgrp_idx[c];
+
+            // aggregate the result by cgroup
+            key = cgrp * num_events + idx;
+            cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
+            if (cgrp_val) {
+                cgrp_val->counter += delta.counter;
+                cgrp_val->enabled += delta.enabled;
+                cgrp_val->running += delta.running;
+                kperf_dbg("cgrp_val : %ld\n", cgrp_val->counter);
+            } else {
+                bpf_map_update_elem(&cgrp_readings, &key, &delta, BPF_ANY);
+            }
+        }
+
+        *prev_val = val;
+    }
+    return 0;
+}
+
+// Entry point: runs the per-cgroup accounting on every sched_switch raw tracepoint.
+SEC("raw_tp/sched_switch")
+int BPF_PROG(trigger_read)
+{
+    return bperf_cgroup_count();
+}
diff --git a/pmu/bpf/sched_counter.bpf.c b/pmu/bpf/sched_counter.bpf.c
new file mode 100644
index 0000000000000000000000000000000000000000..26e9ed1861f8751b84a50bc0b2d725e2bd0569d9
--- /dev/null
+++ b/pmu/bpf/sched_counter.bpf.c
@@ -0,0 +1,134 @@
+/******************************************************************************
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ * libkperf licensed under the Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *     http://license.coscl.org.cn/MulanPSL2
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
+ * PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * Author: Wu
+ * Create: 2025-08-10
+ * Description: the bpf program for ordinary or multi-thread program collecting in counting mode
+ ******************************************************************************/
+// NOTE(review): the four include targets below were missing from the reviewed text and are
+// reconstructed from the symbols this file uses and the -I path in build/common.sh - confirm.
+#include <bpf/vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+char LICENSE[] SEC("license") = "GPL";
+
+#define MAX_ENTRIES 102400
+
+// Success-path tracing is only compiled in with -DKPERF_BPF_DEBUG: bpf_printk() is a slow
+// debugging helper and must not run on every context switch in normal builds.
+#ifdef KPERF_BPF_DEBUG
+#define kperf_dbg(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__)
+#else
+#define kperf_dbg(fmt, ...) do { } while (0)
+#endif
+
+// perf events to read the system pmu count from; on_switch reads the BPF_F_CURRENT_CPU slot
+struct {
+    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+    __uint(key_size, sizeof(__u32));
+    __uint(value_size, sizeof(int));
+    __uint(map_flags, BPF_F_PRESERVE_ELEMS);
+} events SEC(".maps");
+
+// system pmu count at last time sched_switch was triggered
+struct {
+    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+    __uint(key_size, sizeof(__u32));
+    __uint(value_size, sizeof(struct bpf_perf_event_value));
+    __uint(max_entries, 1);
+} prev_readings SEC(".maps");
+
+// accumulated pmu count of pid. key: accum_key, value: count of each core
+// If a pid creates a child process/thread, they use the same accum key as this pid and their pmu events accumulated it
+struct {
+    __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+    __uint(key_size, sizeof(__u32));
+    __uint(value_size, sizeof(struct bpf_perf_event_value));
+    __uint(max_entries, 1024);
+} accum_readings SEC(".maps");
+
+// check whether to record pmu value. key: pid, value: accum_key
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(key_size, sizeof(__u32));
+    __uint(value_size, sizeof(__u32));
+    __uint(max_entries, MAX_ENTRIES);
+    __uint(map_flags, BPF_F_NO_PREALLOC);
+} filter SEC(".maps");
+
+// On every sched_switch: read this CPU's counter and add the delta since the previous reading
+// to the current task's accumulator, provided the task is registered in `filter`.
+SEC("raw_tp/sched_switch")
+int BPF_PROG(on_switch) {
+    __u32 pid;
+    __u32 zero=0;
+    __u32 *accum_key;
+    __u32 cpu = bpf_get_smp_processor_id();
+    long err;
+    struct bpf_perf_event_value cur_val, *prev_val, *accum_val;
+
+    prev_val = bpf_map_lookup_elem(&prev_readings, &zero);
+    if (!prev_val) {
+        bpf_printk("failed to bpf_map_lookup_elem prev_readings.\n");
+        return 0;
+    }
+
+    // get pmu value by API of bpf
+    err = bpf_perf_event_read_value(&events, BPF_F_CURRENT_CPU, &cur_val, sizeof(struct bpf_perf_event_value));
+    if (err) {
+        bpf_printk("failed to bpf_event_read_value: %d cpu %d\n", err, cpu);
+        return 0;
+    }
+    pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    accum_key = bpf_map_lookup_elem(&filter, &pid);
+    if (!accum_key) {
+        // Untracked task: still advance the baseline, otherwise the events it consumed
+        // would be added to the next tracked task that is switched on this CPU.
+        *prev_val = cur_val;
+        return 0;
+    }
+
+    accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
+    if (!accum_val) {
+        *prev_val = cur_val;
+        return 0;
+    }
+
+    accum_val->counter += cur_val.counter - prev_val->counter;
+    accum_val->enabled += cur_val.enabled - prev_val->enabled;
+    accum_val->running += cur_val.running - prev_val->running;
+    kperf_dbg("cur_val counting: %ld prev_val counting: %ld accum_val counting: %ld\n",
+        cur_val.counter, prev_val->counter, accum_val->counter);
+
+    *prev_val = cur_val;
+    return 0;
+}
+
+// When a task registered in `filter` creates a new task, register the new pid with the same accum_key.
+SEC("tp_btf/task_newtask")
+int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags){
+    __u32 new_pid;
+    __u32 parent_pid;
+    __u32 *accum_key;
+
+    parent_pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    new_pid = task->pid;
+
+    accum_key = bpf_map_lookup_elem(&filter, &parent_pid);
+    if (!accum_key) {
+        return 0;
+    }
+
+    bpf_map_update_elem(&filter, &new_pid, accum_key, BPF_NOEXIST);
+    kperf_dbg("new pid: %d parent: %d add child: %ld accum_key: %ld\n", new_pid, parent_pid, new_pid, *accum_key);
+    return 0;
+}
diff --git a/test/test_perf/CMakeLists.txt b/test/test_perf/CMakeLists.txt
index 1eb6f63b1e894fb73313f481947747b4b0091ef2..0978ab6990c95becee2916628cdffd62f5c2e2af 100644
--- a/test/test_perf/CMakeLists.txt
+++ b/test/test_perf/CMakeLists.txt
@@ -4,6 +4,9 @@ include_directories(${CMAKE_CURRENT_LIST_DIR}/../../pmu)
 include_directories(${CMAKE_CURRENT_LIST_DIR}/../../pmu/pfm)
 include_directories(${CMAKE_CURRENT_LIST_DIR}/../../pmu/analyzer/metric)
 include_directories(${CMAKE_CURRENT_LIST_DIR}/../../pmu/decoder)
+if (BPF)
+    include_directories(${CMAKE_CURRENT_LIST_DIR}/../../pmu/bpf)
+endif()
 include_directories(${PROJECT_TOP_DIR}/include)
 add_compile_options(-g)
 set(CMAKE_CXX_STANDARD 14)
diff --git a/third_party/libbpf b/third_party/libbpf
new file mode 160000
index 0000000000000000000000000000000000000000..da08818f4f3b6a8f6d15617184de9a6c34c5b642
--- /dev/null
+++ b/third_party/libbpf
@@ -0,0 +1 @@
+Subproject commit da08818f4f3b6a8f6d15617184de9a6c34c5b642