From 5256ffd0a051e11646a38debfd8bb68cfb738305 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Wed, 23 Feb 2022 16:19:20 +0800 Subject: [PATCH 1/2] monitor/sched: add runqlen to summarize run queue length Summarize scheduler run queue length as a histogram. This is an upgrade to monitor/sched/runlat/runqlen based on ebpf. Signed-off-by: Hailong Liu --- source/tools/monitor/sched/runqlen/Makefile | 8 + .../monitor/sched/runqlen/bpf/runqlen.bpf.c | 108 +++++++ source/tools/monitor/sched/runqlen/runqlen.c | 305 ++++++++++++++++++ source/tools/monitor/sched/runqlen/runqlen.h | 12 + .../monitor/sched/runqlen/trace_helpers.c | 73 +++++ .../monitor/sched/runqlen/trace_helpers.h | 97 ++++++ .../monitor/sched/runqlen/uprobe_helpers.h | 18 ++ 7 files changed, 621 insertions(+) create mode 100644 source/tools/monitor/sched/runqlen/Makefile create mode 100644 source/tools/monitor/sched/runqlen/bpf/runqlen.bpf.c create mode 100644 source/tools/monitor/sched/runqlen/runqlen.c create mode 100644 source/tools/monitor/sched/runqlen/runqlen.h create mode 100644 source/tools/monitor/sched/runqlen/trace_helpers.c create mode 100644 source/tools/monitor/sched/runqlen/trace_helpers.h create mode 100644 source/tools/monitor/sched/runqlen/uprobe_helpers.h diff --git a/source/tools/monitor/sched/runqlen/Makefile b/source/tools/monitor/sched/runqlen/Makefile new file mode 100644 index 00000000..ceeea2e5 --- /dev/null +++ b/source/tools/monitor/sched/runqlen/Makefile @@ -0,0 +1,8 @@ + +newdirs := $(shell find ./ -type d) + +bpfsrcs := $(wildcard bpf/*.bpf.c) +csrcs := $(wildcard *.c) +target := runqlen + +include $(SRC)/mk/bpf.mk diff --git a/source/tools/monitor/sched/runqlen/bpf/runqlen.bpf.c b/source/tools/monitor/sched/runqlen/bpf/runqlen.bpf.c new file mode 100644 index 00000000..c282200f --- /dev/null +++ b/source/tools/monitor/sched/runqlen/bpf/runqlen.bpf.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Wenbo Zhang +#include +#include +#include 
+#include +#include "../runqlen.h" +//#define MAX_CPU_NR 128 +//#define MAX_SLOTS 32 + +//const volatile bool targ_per_cpu = false; + +struct bpf_map_def SEC("maps") args_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(bool), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") hist_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct hist), + .max_entries = MAX_CPU_NR, +}; + +#if 0 +/* + * Todo: + * task->se.cfs_rq->nr_running is not the perfect soluthon for child cpu-cgroup + */ +static u64 get_runq_nr_run(struct task_struct *task) +{ + int limit; + u64 nr_running = 0; + struct sched_entity *se, *topse; + + limit = BPF_CORE_READ(task, se.depth); + topse = BPF_CORE_READ(task, se.parent); + if (topse) { + for (se = topse; se && limit > 1; limit--) { + topse = se; + se = BPF_CORE_READ(se, parent); + } + nr_running = BPF_CORE_READ(topse, cfs_rq, nr_running); + } else { + nr_running = BPF_CORE_READ(task, se.cfs_rq, nr_running); + } + + return nr_running; +} +#else +static u64 get_runq_nr_run(struct task_struct *task) +{ + u64 nr_running = BPF_CORE_READ(task, se.cfs_rq, nr_running); + + return nr_running; +} +#endif + +SEC("perf_event") +int do_sample(struct bpf_perf_event_data *ctx) +{ + struct task_struct *task; + struct sched_entity *sep, *parent, *topse; + struct hist hist, *histp; + u64 slot, cpu = 0; + bool *targ_per_cpu_p , targ_per_cpu = false; + int arg_idx = 0; + + task = (void*)bpf_get_current_task(); + + slot = get_runq_nr_run(task); + /* + * Calculate run queue length by subtracting the currently running task, + * if present. len 0 == idle, len 1 == one running task. 
+ */ + if (slot > 0) + slot--; + + targ_per_cpu_p = bpf_map_lookup_elem(&args_map, &arg_idx); + if (targ_per_cpu_p) + targ_per_cpu = *targ_per_cpu_p; + if (targ_per_cpu) { + cpu = bpf_get_smp_processor_id(); + /* + * When the program is started, the user space will immediately + * exit when it detects this situation, here just to pass the + * verifier's check. + */ + if (cpu >= MAX_CPU_NR) + return 0; + } + histp = bpf_map_lookup_elem(&hist_map, &cpu); + if (histp) { + if (slot >= MAX_SLOTS) + slot = MAX_SLOTS - 1; + if (targ_per_cpu) + histp->slots[slot]++; + else + __sync_fetch_and_add(&histp->slots[slot], 1); + bpf_map_update_elem(&hist_map, &cpu, histp, BPF_ANY); + } else + return -1; + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/source/tools/monitor/sched/runqlen/runqlen.c b/source/tools/monitor/sched/runqlen/runqlen.c new file mode 100644 index 00000000..fafa81b9 --- /dev/null +++ b/source/tools/monitor/sched/runqlen/runqlen.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +// Copyright (c) 2020 Wenbo Zhang +// +// Based on runqlen(8) from BCC by Brendan Gregg. +// 11-Sep-2020 Wenbo Zhang Created this. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "runqlen.h" +#include "bpf/runqlen.skel.h" +#include "trace_helpers.h" + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; }) + +struct env { + bool per_cpu; + bool runqocc; + bool timestamp; + time_t interval; + int freq; + int times; + bool verbose; +} env = { + .interval = 99999999, + .times = 99999999, + .freq = 99, +}; + +static volatile bool exiting; + +const char *argp_program_version = "runqlen 0.1"; +const char *argp_program_bug_address = + "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; +const char argp_program_doc[] = +"Summarize scheduler run queue length as a histogram.\n" +"\n" +"USAGE: runqlen [--help] [-C] [-O] [-T] [-f FREQUENCY] [interval] [count]\n" +"\n" +"EXAMPLES:\n" +" runqlen # summarize run queue length as a histogram\n" +" runqlen 1 10 # print 1 second summaries, 10 times\n" +" runqlen -T 1 # 1s summaries and timestamps\n" +" runqlen -O # report run queue occupancy\n" +" runqlen -C # show each CPU separately\n" +" runqlen -f 199 # sample at 199HZ\n"; + +static const struct argp_option opts[] = { + { "cpus", 'C', NULL, 0, "Print output for each CPU separately" }, + { "frequency", 'f', "FREQUENCY", 0, "Sample with a certain frequency" }, + { "runqocc", 'O', NULL, 0, "Report run queue occupancy" }, + { "timestamp", 'T', NULL, 0, "Include timestamp on output" }, + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + static int pos_args; + + switch (key) { + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + case 'v': + env.verbose = true; + break; + case 'C': + env.per_cpu = true; + break; + case 'O': + env.runqocc = true; + break; + case 'T': + env.timestamp = true; + break; + case 'f': + errno = 0; + env.freq = strtol(arg, NULL, 10); + if (errno || env.freq <= 0) { + fprintf(stderr, "Invalid freq (in hz): %s\n", arg); + argp_usage(state); + } + break; + case ARGP_KEY_ARG: + errno = 0; + if (pos_args == 0) { + env.interval = strtol(arg, NULL, 10); + if (errno) { + 
fprintf(stderr, "invalid internal\n"); + argp_usage(state); + } + } else if (pos_args == 1) { + env.times = strtol(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid times\n"); + argp_usage(state); + } + } else { + fprintf(stderr, + "unrecognized positional argument: %s\n", arg); + argp_usage(state); + } + pos_args++; + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static int nr_cpus; + +static int open_and_attach_perf_event(int freq, struct bpf_program *prog, + struct bpf_link *links[]) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .freq = 1, + .sample_period = freq, + .config = PERF_COUNT_SW_CPU_CLOCK, + }; + int i, fd; + + for (i = 0; i < nr_cpus; i++) { + fd = syscall(__NR_perf_event_open, &attr, -1, i, -1, 0); + if (fd < 0) { + /* Ignore CPU that is offline */ + if (errno == ENODEV) + continue; + fprintf(stderr, "failed to init perf sampling: %s\n", + strerror(errno)); + return -1; + } + links[i] = bpf_program__attach_perf_event(prog, fd); + if (!links[i]) { + fprintf(stderr, "failed to attach perf event on cpu: %d\n", i); + close(fd); + return -1; + } + } + + return 0; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sig_handler(int sig) +{ + exiting = true; +} + +static struct hist zero; + +static void print_runq_occupancy(int hist_fd) +{ + struct hist hist; + int slot, i = 0; + float runqocc; + + do { + __u64 samples, idle = 0, queued = 0, total = 0, meanlen = 0; + bpf_map_lookup_elem(hist_fd, &i, &hist); + bpf_map_update_elem(hist_fd, &i, &zero, BPF_ANY); + for (slot = 0; slot < MAX_SLOTS; slot++) { + __u64 val = hist.slots[slot]; + + if (slot == 0) + idle += val; + else { + total += val * slot; + queued += val; + } + } + samples = idle + queued; + runqocc = queued * 1.0 / max(1ULL, samples); + meanlen = total / max(1ULL, samples); + if 
(env.per_cpu) + printf("runqocc, CPU %-3d %6.2f%%, AVGlen %lld\n", i, + 100 * runqocc, meanlen); + else + printf("runqocc: %0.2f%%, AVGlen %lld\n", 100 * runqocc, meanlen); + } while (env.per_cpu && ++i < nr_cpus); +} + +static void print_linear_hists(int hist_fd) +{ + struct hist hist; + int i = 0; + + do { + bpf_map_lookup_elem(hist_fd, &i, &hist); + bpf_map_update_elem(hist_fd, &i, &zero, BPF_ANY); + //hist = bss->hists[i]; + //bss->hists[i] = zero; + if (env.per_cpu) + printf("cpu = %d\n", i); + print_linear_hist(hist.slots, MAX_SLOTS, 0, 1, "runqlen"); + } while (env.per_cpu && ++i < nr_cpus); +} + +int main(int argc, char **argv) +{ + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + struct bpf_link *links[MAX_CPU_NR] = {}; + struct runqlen_bpf *obj; + struct tm *tm; + char ts[32]; + int err, i, hist_fd, arg_fd, arg_idx = 0; + time_t t; + + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) + return err; + + //libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + printf("failed to get # of possible cpus: '%s'!\n", + strerror(-nr_cpus)); + return 1; + } + if (nr_cpus > MAX_CPU_NR) { + fprintf(stderr, "the number of cpu cores is too big, please " + "increase MAX_CPU_NR's value and recompile"); + return 1; + } + + obj = runqlen_bpf__open(); + if (!obj) { + fprintf(stderr, "failed to open BPF object\n"); + return 1; + } + + /* initialize global data (filtering options) */ + //obj->rodata->targ_per_cpu = env.per_cpu; + + err = runqlen_bpf__load(obj); + if (err) { + fprintf(stderr, "failed to load BPF object: %d\n", err); + goto cleanup; + } + + arg_fd = bpf_map__fd(obj->maps.args_map); + bpf_map_update_elem(arg_fd, &arg_idx, &env.per_cpu, BPF_ANY); + + err = open_and_attach_perf_event(env.freq, obj->progs.do_sample, links); + if (err) + goto cleanup; + + printf("Sampling run queue length... 
Hit Ctrl-C to end.\n"); + + signal(SIGINT, sig_handler); + + hist_fd = bpf_map__fd(obj->maps.hist_map); + while (1) { + sleep(env.interval); + printf("\n"); + + if (env.timestamp) { + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%-8s\n", ts); + } + + if (env.runqocc) + print_runq_occupancy(hist_fd); + else + print_linear_hists(hist_fd); + + if (exiting || --env.times == 0) + break; + } + +cleanup: + for (i = 0; i < nr_cpus; i++) + bpf_link__destroy(links[i]); + runqlen_bpf__destroy(obj); + + return err != 0; +} diff --git a/source/tools/monitor/sched/runqlen/runqlen.h b/source/tools/monitor/sched/runqlen/runqlen.h new file mode 100644 index 00000000..60527289 --- /dev/null +++ b/source/tools/monitor/sched/runqlen/runqlen.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __RUNQLEN_H +#define __RUNQLEN_H + +#define MAX_CPU_NR 128 +#define MAX_SLOTS 32 + +struct hist { + __u32 slots[MAX_SLOTS]; +}; + +#endif /* __RUNQLEN_H */ diff --git a/source/tools/monitor/sched/runqlen/trace_helpers.c b/source/tools/monitor/sched/runqlen/trace_helpers.c new file mode 100644 index 00000000..6dc755f0 --- /dev/null +++ b/source/tools/monitor/sched/runqlen/trace_helpers.c @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +// Copyright (c) 2020 Wenbo Zhang +// +// Based on ksyms improvements from Andrii Nakryiko, add more helpers. +// 28-Feb-2020 Wenbo Zhang Created this. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace_helpers.h" +//#include "uprobe_helpers.h" +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? 
_min1 : _min2; }) + +static void print_stars(unsigned int val, unsigned int val_max, int width) +{ + int num_stars, num_spaces, i; + bool need_plus; + + num_stars = min(val, val_max) * width / val_max; + num_spaces = width - num_stars; + need_plus = val > val_max; + + for (i = 0; i < num_stars; i++) + printf("*"); + for (i = 0; i < num_spaces; i++) + printf(" "); + if (need_plus) + printf("+"); +} + +void print_linear_hist(unsigned int *vals, int vals_size, unsigned int base, + unsigned int step, const char *val_type) +{ + int i, stars_max = 40, idx_min = -1, idx_max = -1; + unsigned int val, val_max = 0; + + for (i = 0; i < vals_size; i++) { + val = vals[i]; + if (val > 0) { + idx_max = i; + if (idx_min < 0) + idx_min = i; + } + if (val > val_max) + val_max = val; + } + + if (idx_max < 0) + return; + + printf(" %-13s : count distribution\n", val_type); + for (i = idx_min; i <= idx_max; i++) { + val = vals[i]; + printf(" %-10d : %-8d |", base + i * step, val); + print_stars(val, val_max, stars_max); + printf("|\n"); + } +} diff --git a/source/tools/monitor/sched/runqlen/trace_helpers.h b/source/tools/monitor/sched/runqlen/trace_helpers.h new file mode 100644 index 00000000..98fd640f --- /dev/null +++ b/source/tools/monitor/sched/runqlen/trace_helpers.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __TRACE_HELPERS_H +#define __TRACE_HELPERS_H + +#include + +#define NSEC_PER_SEC 1000000000ULL + +struct ksym { + const char *name; + unsigned long addr; +}; + +struct ksyms; + +struct ksyms *ksyms__load(void); +void ksyms__free(struct ksyms *ksyms); +const struct ksym *ksyms__map_addr(const struct ksyms *ksyms, + unsigned long addr); +const struct ksym *ksyms__get_symbol(const struct ksyms *ksyms, + const char *name); + +struct sym { + const char *name; + unsigned long start; + unsigned long size; +}; + +struct syms; + +struct syms *syms__load_pid(int tgid); +struct syms *syms__load_file(const char *fname); +void syms__free(struct 
syms *syms); +const struct sym *syms__map_addr(const struct syms *syms, unsigned long addr); + +struct syms_cache; + +struct syms_cache *syms_cache__new(int nr); +struct syms *syms_cache__get_syms(struct syms_cache *syms_cache, int tgid); +void syms_cache__free(struct syms_cache *syms_cache); + +struct partition { + char *name; + unsigned int dev; +}; + +struct partitions; + +struct partitions *partitions__load(void); +void partitions__free(struct partitions *partitions); +const struct partition * +partitions__get_by_dev(const struct partitions *partitions, unsigned int dev); +const struct partition * +partitions__get_by_name(const struct partitions *partitions, const char *name); + +void print_log2_hist(unsigned int *vals, int vals_size, const char *val_type); +void print_linear_hist(unsigned int *vals, int vals_size, unsigned int base, + unsigned int step, const char *val_type); + +unsigned long long get_ktime_ns(void); + +bool is_kernel_module(const char *name); + +/* + * When attempting to use kprobe/kretprobe, please check out new fentry/fexit + * probes, as they provide better performance and usability. But in some + * situations we have to fallback to kprobe/kretprobe probes. This helper + * is used to detect fentry/fexit support for the specified kernel function. + * + * 1. A gap between kernel versions, kernel BTF is exposed + * starting from 5.4 kernel. but fentry/fexit is actually + * supported starting from 5.5. + * 2. Whether kernel supports module BTF or not + * + * *name* is the name of a kernel function to be attached to, which can be + * from vmlinux or a kernel module. + * *mod* is a hint that indicates the *name* may reside in module BTF, + * if NULL, it means *name* belongs to vmlinux. + */ +bool fentry_exists(const char *name, const char *mod); + +/* + * The name of a kernel function to be attached to may be changed between + * kernel releases. 
This helper is used to confirm whether the target kernel + * uses a certain function name before attaching. + * + * It is achieved by scanning + * /sys/kernel/debug/tracing/available_filter_functions + * If this file does not exist, it falls back to parsing /proc/kallsyms, + * which is slower. + */ +bool kprobe_exists(const char *name); + +bool vmlinux_btf_exists(void); +bool module_btf_exists(const char *mod); + +#endif /* __TRACE_HELPERS_H */ diff --git a/source/tools/monitor/sched/runqlen/uprobe_helpers.h b/source/tools/monitor/sched/runqlen/uprobe_helpers.h new file mode 100644 index 00000000..47f77bb2 --- /dev/null +++ b/source/tools/monitor/sched/runqlen/uprobe_helpers.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021 Google LLC. */ +#ifndef __UPROBE_HELPERS_H +#define __UPROBE_HELPERS_H + +#include +#include +#include + +int get_pid_binary_path(pid_t pid, char *path, size_t path_sz); +int get_pid_lib_path(pid_t pid, const char *lib, char *path, size_t path_sz); +int resolve_binary_path(const char *binary, pid_t pid, char *path, size_t path_sz); +off_t get_elf_func_offset(const char *path, const char *func); +Elf *open_elf(const char *path, int *fd_close); +Elf *open_elf_by_fd(int fd); +void close_elf(Elf *e, int fd_close); + +#endif /* __UPROBE_HELPERS_H */ -- Gitee From fd07a2cf49c6a1441376b97b2f868fe68663d607 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Wed, 23 Feb 2022 16:37:50 +0800 Subject: [PATCH 2/2] sched/runqlen: Add README doc Add user manual for runqlen. 
Signed-off-by: Hailong Liu --- source/tools/monitor/sched/runqlen/README.txt | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 source/tools/monitor/sched/runqlen/README.txt diff --git a/source/tools/monitor/sched/runqlen/README.txt b/source/tools/monitor/sched/runqlen/README.txt new file mode 100644 index 00000000..c1f84e68 --- /dev/null +++ b/source/tools/monitor/sched/runqlen/README.txt @@ -0,0 +1,55 @@ +1 What is runqlen? +Summarize scheduler run queue length as a histogram. + + +2 Usage of runqlen +USAGE: runqlen [--help] [-C] [-O] [-T] [-f FREQUENCY] [interval] [count] + +EXAMPLES: + runqlen # summarize run queue length as a histogram + runqlen 1 10 # print 1 second summaries, 10 times + runqlen -T 1 # 1s summaries and timestamps + runqlen -O # report run queue occupancy + runqlen -C # show each CPU separately + runqlen -f 199 # sample at 199HZ + + -C, --cpus Print output for each CPU separately + -f, --frequency=FREQUENCY Sample with a certain frequency + -O, --runqocc Report run queue occupancy + -T, --timestamp Include timestamp on output + -v, --verbose Verbose debug output + -?, --help Give this help list + --usage Give a short usage message + -V, --version Print program version + +3 Example +3.1 perf the run queue occupancy 2 times every 5 seconds +sudo sysak runqlen 5 2 -C -T -O + +16:06:54 +runqocc, CPU 0 0.00%, AVGlen 0 +runqocc, CPU 1 0.00%, AVGlen 0 +runqocc, CPU 2 0.00%, AVGlen 0 +runqocc, CPU 3 0.00%, AVGlen 0 + +16:06:55 +runqocc, CPU 0 0.00%, AVGlen 0 +runqocc, CPU 1 0.00%, AVGlen 0 +runqocc, CPU 2 0.00%, AVGlen 0 +runqocc, CPU 3 0.00%, AVGlen 0 + +3.2 perf the run queue histogram 2 times every 5 seconds +sudo sysak runqlen 5 2 -C -T +16:34:56 +cpu = 0 + runqlen : count distribution + 0 : 495 |****************************************| +cpu = 1 + runqlen : count distribution + 0 : 495 |****************************************| +cpu = 2 + runqlen : count distribution + 0 : 495 
|****************************************| +cpu = 3 + runqlen : count distribution + 0 : 495 |****************************************| -- Gitee