diff --git a/source/tools/detect/syscall_slow/Makefile b/source/tools/detect/syscall_slow/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..53c8c392b72117c5149853f24d55bd796adeb24a --- /dev/null +++ b/source/tools/detect/syscall_slow/Makefile @@ -0,0 +1,8 @@ + +newdirs := $(shell find ./ -type d) + +bpfsrcs := $(wildcard bpf/*.bpf.c) +csrcs := $(wildcard *.c) +target := syscall_slow + +include $(SRC)/mk/bpf.mk diff --git a/source/tools/detect/syscall_slow/bpf/syscall_slow.bpf.c b/source/tools/detect/syscall_slow/bpf/syscall_slow.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..9290e2568f507de7823c7c5e91468e48f7864703 --- /dev/null +++ b/source/tools/detect/syscall_slow/bpf/syscall_slow.bpf.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "../syscall_slow.h" + +#define TASK_RUNNING 0 +#define PERF_MAX_STACK_DEPTH 127 +#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) +#define _(P) ({ \ + typeof(P) val; \ + __builtin_memset(&val, 0, sizeof(val)); \ + bpf_probe_read(&val, sizeof(val), &P); \ + val; \ +}) + +#define name_n_equal(src, dst, dstn, MAXn) ({ \ + bool ret = true; \ + int i; \ + for (int i = 0; i < MAXn; i++) { \ + if (i >= dstn) \ + break; \ + if (src[i] != dst[i]) { \ + ret = false; \ + break;} \ + } \ + ret; \ +}) + +/* + * struct trace_event_raw_sys_enter/exit may not defined in + * some old versions, so we do a work-around + * */ +struct raw_sys_enter_arg { + struct trace_entry ent; + long int id; + long unsigned int args[6]; + char __data[0]; +}; + +struct raw_sys_exit_arg { + struct trace_entry ent; + long int id; + long int ret; + char __data[0]; +}; + +struct sched_sched_switch_args { + struct trace_entry ent; + char prev_comm[16]; + pid_t prev_pid; + int prev_prio; + long int prev_state; + char next_comm[16]; + pid_t next_pid; + int next_prio; + char __data[0]; +}; + +struct enter_info { + u64 csw_begin, enter_begin; + long int prev_state; + u64 itime, vtime; + u64 stime, exec_time, nvcsw, nivcsw; + u32 ret; +}; + +struct bpf_map_def SEC("maps") stackmap = { + .type = BPF_MAP_TYPE_STACK_TRACE, + .key_size = sizeof(u32), + .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), + .max_entries = 10240, +}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, u32); + __type(value, struct arg_info); +} arg_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 10240); + __type(key, u32); + __type(value, struct enter_info); +} enter_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); +} events SEC(".maps"); + + +SEC("tp/sched/sched_switch") +int handle__sched_switch(struct sched_sched_switch_args *ctx) +{ + u64 pid, prev_pid; + u64 now, itime, vtime, csw_begin; + struct enter_info *enterp; + + __builtin_memset(&itime, 0,sizeof(u64)); + __builtin_memset(&vtime, 0,sizeof(u64)); + __builtin_memset(&csw_begin, 0,sizeof(u64)); + __builtin_memset(&now, 0,sizeof(u64)); + prev_pid = ctx->prev_pid; + pid = ctx->next_pid; + enterp = bpf_map_lookup_elem(&enter_map, &prev_pid); + now = bpf_ktime_get_ns(); + if (enterp) { + enterp->csw_begin = now; + enterp->prev_state = ctx->prev_state; + enterp->ret = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS); + } + + enterp = bpf_map_lookup_elem(&enter_map, &pid); + if (enterp) { + itime = _(enterp->itime); + vtime = _(enterp->vtime); + csw_begin = _(enterp->csw_begin); + if (enterp->prev_state == TASK_RUNNING) { + enterp->itime = itime + (now - csw_begin); + } else { + enterp->vtime = vtime + (now - csw_begin); + } + } + + return 0; +} + +SEC("tp/raw_syscalls/sys_enter") +int handle_raw_sys_enter(struct raw_sys_enter_arg *ctx) +{ + u64 now; + int pid, i = 0; + bool match; + char comm[16] = {0}; + struct task_struct *task; + struct arg_info *argp; + struct enter_info *enterp, enter; + struct filter filter; + + __builtin_memset(&filter, 0, sizeof(filter)); + bpf_get_current_comm(&comm, sizeof(comm)); + argp = bpf_map_lookup_elem(&arg_map, &i); + if (argp) + filter = _(argp->filter); + else + return 0; + if (filter.size && filter.size != -1) { + unsigned long size = filter.size; + match = name_n_equal(comm, filter.comm, size, 16); + if (!match) + return 0; + } else if (filter.size == -1) { + ;/*todo*/ + } else { + return 0; /* size==0, skip */ + } + pid = bpf_get_current_pid_tgid(); + task = (void *)bpf_get_current_task(); + enterp = bpf_map_lookup_elem(&enter_map, &pid); + if (!enterp) { + __builtin_memset(&enter, 0, sizeof(enter)); + now = bpf_ktime_get_ns(); + enter.enter_begin = now; + enter.exec_time = BPF_CORE_READ(task, se.sum_exec_runtime); + enter.stime = _(task->stime); + enter.nvcsw = _(task->nvcsw); + enter.nivcsw = _(task->nivcsw); + enter.itime = 0; + enter.vtime = 0; + bpf_map_update_elem(&enter_map, &pid, &enter, 0); + } + return 0; +} + +SEC("tp/raw_syscalls/sys_exit") +int handle_raw_sys_exit(struct raw_sys_exit_arg *ctx) +{ + int pid, i = 0; + bool match; + char comm[16] = {0}; + struct arg_info *argp; + struct task_struct *task; + struct enter_info *enterp; + struct filter filter; + struct event event = {}; + + __builtin_memset(&filter, 0, sizeof(filter)); + bpf_get_current_comm(&comm, sizeof(comm)); + argp = bpf_map_lookup_elem(&arg_map, &i); + if (argp) + filter = _(argp->filter); + else + return 0; + if (filter.size && filter.size != -1) { + unsigned long size = filter.size; + match = name_n_equal(comm, filter.comm, size, 16); + if (!match) + return 0; + } else if (filter.size == -1) { + ;/* todo: shell we monitor all threads? */ + } else { + return 0; /* size==0, we skip because its too much... */ + } + + pid = bpf_get_current_pid_tgid(); + task = (void *)bpf_get_current_task(); + enterp = bpf_map_lookup_elem(&enter_map, &pid); + if (enterp) { + u64 delta, now, exec_time; + now = bpf_ktime_get_ns(); + delta = now - enterp->enter_begin; + if (delta > argp->thresh) { + exec_time = BPF_CORE_READ(task, se.sum_exec_runtime); + event.realtime = exec_time - enterp->exec_time; + event.stime = _(task->stime); + event.stime -= enterp->stime; + event.nvcsw = _(task->nvcsw); + event.nvcsw -= enterp->nvcsw; + event.nivcsw = _(task->nivcsw); + event.nivcsw -= enterp->nivcsw; + event.itime = enterp->itime; + event.vtime = enterp->vtime; + event.delay = delta; + event.sysid = ctx->id; + event.ret = enterp->ret; + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, + &event, sizeof(event)); + } + bpf_map_delete_elem(&enter_map, &pid); + } + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/source/tools/detect/syscall_slow/stacktrace.c b/source/tools/detect/syscall_slow/stacktrace.c new file mode 100644 index 0000000000000000000000000000000000000000..7658b6d7593bdabd25e297e57c1ecba95cecd842 --- /dev/null +++ b/source/tools/detect/syscall_slow/stacktrace.c @@ -0,0 +1,122 @@ +#include +#include +#include +//#include +//#include +//#include +#include +#include +//#include +#include +#include +#include +#include +#include "syscall_slow.h" + +#define MAX_SYMS 300000 +#define PERF_MAX_STACK_DEPTH 127 + +static int sym_cnt; +extern FILE *filep; + +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +int load_kallsyms(struct ksym **pksyms) +{ + struct ksym *syms; + FILE *f = fopen("/proc/kallsyms", "r"); + char func[256], buf[256]; + char symbol; + void *addr; + int i = 0; + + if (!f) + return -ENOENT; + + syms = malloc(MAX_SYMS * sizeof(struct ksym)); + if (!syms) { + fclose(f); + return -ENOMEM; + } + + while (!feof(f)) { + if (!fgets(buf, sizeof(buf), f)) + break; + if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) + break; + if (!addr) + continue; + syms[i].addr = (long) addr; + syms[i].name = strdup(func); + i++; + if (i > MAX_SYMS) { + printf("Warning: no space on ksym array!\n"); + break; + } + } + fclose(f); + sym_cnt = i; + qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp); + *pksyms = syms; + return 0; +} + +struct ksym *ksym_search(long key, struct ksym *syms) +{ + int start = 0, end = sym_cnt; + int result; + + /* kallsyms not loaded. return NULL */ + if (sym_cnt <= 0) + return NULL; + + while (start < end) { + size_t mid = start + (end - start) / 2; + + result = key - syms[mid].addr; + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return &syms[mid]; + } + + if (start >= 1 && syms[start - 1].addr < key && + key < syms[start].addr) + /* valid ksym */ + return &syms[start - 1]; + + /* out of range. return _stext */ + return &syms[0]; +} + +static void print_ksym(__u64 addr, struct ksym *psym) +{ + struct ksym *sym; + + if (!addr) + return; + + sym = ksym_search(addr, psym); + fprintf(filep, "<0x%llx> %s\n", addr, sym->name); +} + +void print_stack(int fd, __u32 ret, struct ksym *syms) +{ + int i; + __u64 ip[PERF_MAX_STACK_DEPTH] = {}; + + if (bpf_map_lookup_elem(fd, &ret, &ip) == 0) { + for (i = 0; i < PERF_MAX_STACK_DEPTH - 1; i++) + print_ksym(ip[i], syms); + bpf_map_delete_elem(fd, &ret); + } else { + if ((int)(ret) < 0) + fprintf(filep, "<0x0000000000000000>:error=%d\n", (int)(ret)); + } +} + diff --git a/source/tools/detect/syscall_slow/syscall_slow.c b/source/tools/detect/syscall_slow/syscall_slow.c new file mode 100644 index 0000000000000000000000000000000000000000..8a0e6df07414afd4de139f811bde17db903af801 --- /dev/null +++ b/source/tools/detect/syscall_slow/syscall_slow.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "syscall_slow.h" +#include "./bpf/syscall_slow.skel.h" + +struct env { + time_t duration; + bool verbose; + __u64 threshold; + struct filter filter; +} env = { + .duration = 0, + .threshold = 10*1000*1000, /* 10ms */ +}; + +FILE *filep = NULL; +static int stackmp_fd; +static struct ksym *ksyms; +char filename[256] = {0}; +char log_dir[] = "/var/log/sysak/syscall_slow"; +char defaultfile[] = "/var/log/sysak/syscall_slow/syscall_slow.log"; +volatile sig_atomic_t exiting = 0; + +void print_stack(int fd, __u32 ret, struct ksym *syms); +int load_kallsyms(struct ksym **pksyms); +const char *argp_program_version = "syscall_slow 0.1"; +const char argp_program_doc[] = +"Catch the syscall slow time more than threshold.\n" +"\n" +"USAGE: syscall_slow [--help] [-t THRESH(ms)] [-f LOGFILE] [duration(s)]\n" +"\n" +"EXAMPLES:\n" +" syscall_slow # run forever, and detect syscall_slow more than 10ms(default)\n" +" syscall_slow -t 15 # detect syscall_slow with threshold 15ms (default 10ms)\n" +" syscall_slow -c bash # trace aplication who's name is bash\n" +" syscall_slow -f a.log # record result to a.log (default to ~sysak/syscall_slow/syscall_slow.log)\n"; + +static const struct argp_option opts[] = { + { "threshold", 't', "THRESH", 0, "Threshold to detect, default 10ms"}, + { "comm", 'c', "COMM", 0, "Name of the application"}, + { "logfile", 'f', "LOGFILE", 0, "logfile for result"}, + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + int ret = errno; + static int pos_args; + + switch (key) { + case 'h': + argp_state_help(state, stderr, ARGP_HELP_STD_HELP); + break; + case 'v': + env.verbose = true; + break; + case 't': + errno = 0; + __u64 thresh; + thresh = strtoull(arg, NULL, 10); + if (errno) { + fprintf(stderr, "invalid threshold\n"); + argp_usage(state); + break; + } + env.threshold = thresh * 1000*1000; + break; + case 'c': + env.filter.size = strlen(arg); + if (env.filter.size < 1) { + fprintf(stderr, "Invalid COMM: %s\n", arg); + argp_usage(state); + return -1; + } + + if (env.filter.size > TASK_COMM_LEN - 1) + env.filter.size = TASK_COMM_LEN - 1; + + strncpy(env.filter.comm, arg, env.filter.size); + break; + case 'f': + if (strlen(arg) < 2) + strncpy(filename, defaultfile, sizeof(filename)); + else + strncpy(filename, arg, sizeof(filename)); + filep = fopen(filename, "w+"); + if (!filep) { + ret = errno; + fprintf(stderr, "%s :fopen %s\n", + strerror(errno), filename); + return ret; + } + break; + case ARGP_KEY_ARG: + if (pos_args++) { + fprintf(stderr, + "unrecognized positional argument: %s\n", arg); + argp_usage(state); + } + errno = 0; + env.duration = strtol(arg, NULL, 10); + if (errno) { + ret = errno; + fprintf(stderr, "invalid duration\n"); + argp_usage(state); + return ret; + } + break; + default: + return ARGP_ERR_UNKNOWN; + } + if (!filep) { + filep = fopen(defaultfile, "w+"); + if (!filep) { + ret = errno; + fprintf(stderr, "%s :fopen %s\n", + strerror(errno), defaultfile); + return ret; + } + } + + return 0; +} + +static void bump_memlock_rlimit(void) +{ + struct rlimit rlim_new = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) { + fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n"); + exit(1); + } +} + +void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + const struct event *e = data; + struct tm *tm; + char ts[64]; + time_t t; + + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%F_%H:%M:%S", tm); + if (0) { + ; + } else { + fprintf(filep, "%-21s %-8lld %-6lld %-6lld %-6lld %-6lld %-6lld %-6lld %-6ld\n", + ts, e->delay/(1000), e->realtime/(1000), e->itime/(1000), e->vtime/(1000), + e->stime/1000, e->nvcsw, e->nivcsw, e->sysid); + fflush(filep); + print_stack(stackmp_fd, e->ret, ksyms); + } +} + +void syscall_slow_handler(int poll_fd, int map_fd) +{ + int arg_key = 0, err = 0; + struct arg_info arg_info = {}; + struct perf_buffer *pb = NULL; + struct perf_buffer_opts pb_opts = {}; + + fprintf(filep, "%-21s %-8s %-6s %-6s %-6s %-6s %-6s %-6s %-6s\n", + "TIME(syscall)", "DELAY", "REAL", "WAIT", "SLEEP", + "SYS", "vcsw", "ivcsw", "sysid"); + + pb_opts.sample_cb = handle_event; + pb = perf_buffer__new(poll_fd, 64, &pb_opts); + if (!pb) { + err = -errno; + fprintf(stderr, "failed to open perf buffer: %d\n", err); + goto clean_syscall_slow; + } + + arg_info.thresh = env.threshold; + arg_info.filter = env.filter; + err = bpf_map_update_elem(map_fd, &arg_key, &arg_info, 0); + if (err) { + fprintf(stderr, "Failed to update arg_map\n"); + goto clean_syscall_slow; + } + + while (!exiting) { + err = perf_buffer__poll(pb, 100); + if (err < 0 && err != -EINTR) { + fprintf(stderr, "error polling perf buffer: %s\n", strerror(-err)); + goto clean_syscall_slow; + } + /* reset err to return 0 if exiting */ + err = 0; + } + +clean_syscall_slow: + perf_buffer__free(pb); +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static int prepare_dictory(char *path) +{ + int ret; + + ret = mkdir(path, 0777); + if (ret < 0 && errno != EEXIST) + return errno; + else + return 0; +} + +static void sig_exit(int signo) +{ + exiting = 1; +} + +int main(int argc, char **argv) +{ + int err, ent_fd, arg_fd; + struct syscall_slow_bpf *obj; + static const struct argp argp = { + .options = opts, + .parser = parse_arg, + .doc = argp_program_doc, + }; + + err = prepare_dictory(log_dir); + if (err) { + fprintf(stderr, "prepare_dictory %s fail\n", log_dir); + return err; + } + ksyms = NULL; + + memset(&env.filter, 0, sizeof(struct filter)); + err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) { + fprintf(stderr, "argp_parse fail\n"); + return err; + } + libbpf_set_print(libbpf_print_fn); + + bump_memlock_rlimit(); + err = load_kallsyms(&ksyms); + if (err) { + fprintf(stderr, "Failed to load kallsyms\n"); + return err; + } + + obj = syscall_slow_bpf__open_and_load(); + if (!obj) { + fprintf(stderr, "failed to open and/or load BPF object\n"); + goto cleanup; + } + + err = syscall_slow_bpf__attach(obj); + if (err) { + fprintf(stderr, "failed to attach BPF programs\n"); + goto cleanup; + } + + arg_fd = bpf_map__fd(obj->maps.arg_map); + ent_fd = bpf_map__fd(obj->maps.events); + stackmp_fd = bpf_map__fd(obj->maps.stackmap); + + if (signal(SIGINT, sig_exit) == SIG_ERR || + signal(SIGALRM, sig_exit) == SIG_ERR) { + fprintf(stderr, "can't set signal handler: %s\n", strerror(errno)); + err = 1; + goto cleanup; + } + + if (env.duration) + alarm(env.duration); + + syscall_slow_handler(ent_fd, arg_fd); + +cleanup: + syscall_slow_bpf__destroy(obj); + if (ksyms) + free(ksyms); + + return err != 0; +} + diff --git a/source/tools/detect/syscall_slow/syscall_slow.h b/source/tools/detect/syscall_slow/syscall_slow.h new file mode 100644 index 0000000000000000000000000000000000000000..2cccc05488191a7c977ac91e95b08eb439c07fb5 --- /dev/null +++ b/source/tools/detect/syscall_slow/syscall_slow.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __IRQOFF_H +#define __IRQOFF_H + +#define TASK_COMM_LEN 16 +#define CPU_ARRY_LEN 4 + +struct info { + __u64 prev_counter; +}; + +struct tm_info { + __u64 last_stamp; +}; + +struct filter { + int size; + char comm[16]; +}; + +struct arg_info { + __u64 thresh; + struct filter filter; +}; + +struct event { + __u32 ret, pid; + long int sysid; + __u64 delay, stamp; + __u64 nvcsw, nivcsw, icnt, vcnt; + __u64 realtime, stime, itime, vtime; +}; + +struct ksym { + long addr; + char *name; +}; +#endif /* __LLCSTAT_H */ +