From c188a304268944898f61dc481e16295dfc1fcea3 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Mon, 11 Apr 2022 16:41:11 +0800 Subject: [PATCH 1/2] schedmoni: Add filter according to app's comm Signed-off-by: Hailong Liu --- .../sched/schedmoni/bpf/schedmoni.bpf.c | 87 ++++++++++++++----- .../tools/monitor/sched/schedmoni/runqslow.c | 2 +- .../tools/monitor/sched/schedmoni/schedmoni.c | 24 ++++- .../tools/monitor/sched/schedmoni/schedmoni.h | 6 ++ 4 files changed, 92 insertions(+), 27 deletions(-) diff --git a/source/tools/monitor/sched/schedmoni/bpf/schedmoni.bpf.c b/source/tools/monitor/sched/schedmoni/bpf/schedmoni.bpf.c index 8ae5bb09..198586c1 100644 --- a/source/tools/monitor/sched/schedmoni/bpf/schedmoni.bpf.c +++ b/source/tools/monitor/sched/schedmoni/bpf/schedmoni.bpf.c @@ -8,7 +8,7 @@ #define MAX_THRESH (10*1000) #define TASK_RUNNING 0 -#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) +#define _(P) ({typeof(P) val; __builtin_memset(&val, 0, sizeof(val)); bpf_probe_read(&val, sizeof(val), &P); val;}) struct { __uint(type, BPF_MAP_TYPE_ARRAY); @@ -51,18 +51,15 @@ struct { __type(value, struct latinfo); } info_map SEC(".maps"); -/* - * the return value type can only be assigned to 0, - * so it can be int ,long , long long and the unsinged version - * */ #define GETARG_FROM_ARRYMAP(map,argp,type,member)({ \ - type retval = 0; \ - int i = 0; \ - argp = bpf_map_lookup_elem(&map, &i); \ - if (argp) { \ + int i = 0; \ + type retval; \ + __builtin_memset(&retval, 0, sizeof(type)); \ + argp = bpf_map_lookup_elem(&map, &i); \ + if (argp) { \ retval = _(argp->member); \ - } \ - retval; \ + } \ + retval; \ }) #define BPF_F_FAST_STACK_CMP (1ULL << 9) @@ -71,6 +68,20 @@ struct { #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) #define BITS_PER_LONG 64 +#define strequal(a, pcom) ({ \ + bool ret = true; \ + int i; \ + unsigned long size = pcom.size; \ + for (int i = 0; i < 16; i++) { \ + if (i > size) \ + break; \ + if (a[i] != pcom.comm[i]) { \ 
+ ret = false; \ + break;} \ + } \ + ret; \ +}) + static inline int test_bit(int nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); @@ -103,23 +114,48 @@ static inline int test_tsk_need_resched(struct task_struct *tsk, int flag) } /* record enqueue timestamp */ -static __always_inline -int trace_enqueue(u32 tgid, u32 pid, unsigned int runqlen) +static int trace_enqueue(struct task_struct *p, unsigned int runqlen) { - struct enq_info enq_info; + bool comm_eqaul = false, use_comm = true; + char comm[16]; u64 ts; - pid_t targ_tgid, targ_pid; struct args *argp; + u32 tgid, pid; + struct enq_info enq_info; + pid_t targ_tgid, targ_pid; + + tgid = _(p->tgid); + pid = _(p->pid); if (!pid) return 0; - targ_tgid = GETARG_FROM_ARRYMAP(argmap, argp, pid_t, targ_tgid); - targ_pid = GETARG_FROM_ARRYMAP(argmap, argp, pid_t, targ_pid); - if (targ_tgid && targ_tgid != tgid) - return 0; - if (targ_pid && targ_pid != pid) - return 0; + { + int k = 0; + struct comm_item comm_i; + + argp = bpf_map_lookup_elem(&argmap, &k); + __builtin_memset(&comm_i, 0, sizeof(comm_i)); + if (argp) + comm_i = _(argp->comm_i); + bpf_probe_read(comm, sizeof(comm), &(p->comm)); + if (comm_i.size) { + comm_eqaul = strequal(comm, comm_i); + if (!comm_eqaul) + return 0; + } else + use_comm = false; + } + + if (!use_comm) { + targ_tgid = GETARG_FROM_ARRYMAP(argmap, argp, pid_t, targ_tgid); + targ_pid = GETARG_FROM_ARRYMAP(argmap, argp, pid_t, targ_pid); + if (targ_tgid && targ_tgid != tgid) + return 0; + + if (targ_pid && targ_pid != pid) + return 0; + } ts = bpf_ktime_get_ns(); enq_info.ts = ts; @@ -131,21 +167,23 @@ int trace_enqueue(u32 tgid, u32 pid, unsigned int runqlen) SEC("raw_tracepoint/sched_wakeup") int raw_tracepoint__sched_wakeup(struct bpf_raw_tracepoint_args *ctx) { + char abcde[16]; unsigned int runqlen = 0; struct task_struct *p = (void *)ctx->args[0]; runqlen = BPF_CORE_READ(p, se.cfs_rq, nr_running); - return trace_enqueue(_(p->tgid), 
_(p->pid), runqlen); + return trace_enqueue(p, runqlen); } SEC("raw_tracepoint/sched_wakeup_new") int raw_tracepoint__sched_wakeup_new(struct bpf_raw_tracepoint_args *ctx) { + char abcde[16]; unsigned int runqlen = 0; struct task_struct *p = (void *)ctx->args[0]; runqlen = BPF_CORE_READ(p, se.cfs_rq, nr_running); - return trace_enqueue(_(p->tgid), _(p->pid), runqlen); + return trace_enqueue(p, runqlen); } SEC("tp/sched/sched_switch") @@ -186,7 +224,7 @@ int handle_switch(struct trace_event_raw_sched_switch *ctx) unsigned int runqlen = 0; runqlen = BPF_CORE_READ(prev, se.cfs_rq, nr_running); - return trace_enqueue(_(prev->tgid), _(prev->pid), runqlen); + return trace_enqueue(prev, runqlen); } /* fetch timestamp and calculate delta */ enq = bpf_map_lookup_elem(&start, &pid); @@ -206,6 +244,7 @@ int handle_switch(struct trace_event_raw_sched_switch *ctx) event.rqlen = _(enq->rqlen); bpf_probe_read(event.task, sizeof(event.task), &(ctx->next_comm)); bpf_probe_read(event.prev_task, sizeof(event.prev_task), &(ctx->prev_comm)); + /* output */ bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); diff --git a/source/tools/monitor/sched/schedmoni/runqslow.c b/source/tools/monitor/sched/schedmoni/runqslow.c index fcb456c7..d74e89b2 100644 --- a/source/tools/monitor/sched/schedmoni/runqslow.c +++ b/source/tools/monitor/sched/schedmoni/runqslow.c @@ -58,7 +58,7 @@ void *runslw_handler(void *arg) fprintf(stderr, "failed to open perf buffer: %d\n", err); goto clean_runslw; } - + /* todo: warm up */ while (!exiting) { err = perf_buffer__poll(pb, 100); if (err < 0 && err != -EINTR) { diff --git a/source/tools/monitor/sched/schedmoni/schedmoni.c b/source/tools/monitor/sched/schedmoni/schedmoni.c index c4b19f06..92e1f475 100644 --- a/source/tools/monitor/sched/schedmoni/schedmoni.c +++ b/source/tools/monitor/sched/schedmoni/schedmoni.c @@ -32,20 +32,22 @@ const char *argp_program_version = "schedmoni 0.1"; const char argp_program_doc[] = "Trace schedule 
latency.\n" "\n" -"USAGE: schedmoni [--help] [-s SPAN] [-t TID] [-P] [min_us] [-f ./runslow.log]\n" +"USAGE: schedmoni [--help] [-s SPAN] [-t TID] [-c COMM] [-P] [min_us] [-f LOGFILE]\n" "\n" "EXAMPLES:\n" " schedmoni # trace latency higher than 10000 us (default)\n" -" schedmoni -f a.log # trace latency and record result to a.log (default to /var/log/sysak/runslow.log)\n" +" schedmoni -f a.log # record result to a.log (default to ~sysak/schedmoni/schedmoni.log)\n" " schedmoni 1000 # trace latency higher than 1000 us\n" " schedmoni -p 123 # trace pid 123\n" " schedmoni -t 123 # trace tid 123 (use for threads only)\n" +" schedmoni -c bash # trace aplication who's name is bash\n" " schedmoni -s 10 # monitor for 10 seconds\n" " schedmoni -P # also show previous task name and TID\n"; static const struct argp_option opts[] = { { "pid", 'p', "PID", 0, "Process PID to trace"}, { "tid", 't', "TID", 0, "Thread TID to trace"}, + { "comm", 'c', "COMM", 0, "Name of the application"}, { "span", 's', "SPAN", 0, "How long to run"}, { "verbose", 'v', NULL, 0, "Verbose debug output" }, { "previous", 'P', NULL, 0, "also show previous task name and TID" }, @@ -100,6 +102,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) if (errno || pid <= 0) { fprintf(stderr, "Invalid PID: %s\n", arg); argp_usage(state); + return errno; } env.pid = pid; break; @@ -109,15 +112,30 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) if (errno || pid <= 0) { fprintf(stderr, "Invalid TID: %s\n", arg); argp_usage(state); + return errno; } env.tid = pid; break; + case 'c': + env.comm.size = strlen(arg); + if (env.comm.size < 1) { + fprintf(stderr, "Invalid COMM: %s\n", arg); + argp_usage(state); + return -1; + } + + if (env.comm.size > TASK_COMM_LEN - 1) + env.comm.size = TASK_COMM_LEN - 1; + + strncpy(env.comm.comm, arg, env.comm.size); + break; case 's': errno = 0; span = strtoul(arg, NULL, 10); if (errno || span <= 0) { fprintf(stderr, "Invalid SPAN: 
%s\n", arg); argp_usage(state); + return errno; } env.span = span; break; @@ -241,6 +259,7 @@ int main(int argc, char **argv) if (err) return err; + memset(&env.comm, 0, sizeof(struct comm_item)); err = argp_parse(&argp, argc, argv, 0, NULL, NULL); if (err) return err; @@ -272,6 +291,7 @@ int main(int argc, char **argv) ent_fd = bpf_map__fd(obj->maps.events); stk_fd = bpf_map__fd(obj->maps.stackmap); stkext_fd = bpf_map__fd(obj->maps.stackmap_ext); + args.comm_i = env.comm; args.targ_tgid = env.pid; args.targ_pid = env.tid; args.min_us = env.min_us; diff --git a/source/tools/monitor/sched/schedmoni/schedmoni.h b/source/tools/monitor/sched/schedmoni/schedmoni.h index ec74a522..16164442 100644 --- a/source/tools/monitor/sched/schedmoni/schedmoni.h +++ b/source/tools/monitor/sched/schedmoni/schedmoni.h @@ -8,11 +8,16 @@ #define TIF_NEED_RESCHED 1 #endif +struct comm_item { + char comm[TASK_COMM_LEN]; + unsigned long size; +}; struct args { __u64 min_us; pid_t targ_pid; pid_t targ_tgid; + struct comm_item comm_i; int flag; }; @@ -37,6 +42,7 @@ struct env { bool previous; bool verbose; void *fp; + struct comm_item comm; }; struct event { -- Gitee From 4e4ec99f20c4e17af1c092bbdbddf00e00a90651 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Mon, 11 Apr 2022 16:45:23 +0800 Subject: [PATCH 2/2] schedmoni: Add warm-up for schedmoni The eBPF programs are loaded before our monitor threads start, which may lead to some unexpected data being recorded. 
Signed-off-by: Hailong Liu --- .../sched/schedmoni/bpf/schedmoni.bpf.c | 30 ++++++++++++++++--- .../tools/monitor/sched/schedmoni/runqslow.c | 14 +++++++-- .../tools/monitor/sched/schedmoni/schedmoni.c | 2 ++ .../tools/monitor/sched/schedmoni/schedmoni.h | 1 + 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/source/tools/monitor/sched/schedmoni/bpf/schedmoni.bpf.c b/source/tools/monitor/sched/schedmoni/bpf/schedmoni.bpf.c index 198586c1..c23d203d 100644 --- a/source/tools/monitor/sched/schedmoni/bpf/schedmoni.bpf.c +++ b/source/tools/monitor/sched/schedmoni/bpf/schedmoni.bpf.c @@ -73,7 +73,7 @@ struct { int i; \ unsigned long size = pcom.size; \ for (int i = 0; i < 16; i++) { \ - if (i > size) \ + if (i >= size) \ break; \ if (a[i] != pcom.comm[i]) { \ ret = false; \ @@ -82,6 +82,18 @@ struct { ret; \ }) +static bool program_ready(void) +{ + int i = 0; + struct args *argp; + bool ready = false; + + argp = bpf_map_lookup_elem(&argmap, &i); + if (argp) + ready = argp->ready; + return ready; +} + static inline int test_bit(int nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); @@ -143,8 +155,9 @@ static int trace_enqueue(struct task_struct *p, unsigned int runqlen) comm_eqaul = strequal(comm, comm_i); if (!comm_eqaul) return 0; - } else + } else { use_comm = false; + } } if (!use_comm) { @@ -167,10 +180,12 @@ static int trace_enqueue(struct task_struct *p, unsigned int runqlen) SEC("raw_tracepoint/sched_wakeup") int raw_tracepoint__sched_wakeup(struct bpf_raw_tracepoint_args *ctx) { - char abcde[16]; unsigned int runqlen = 0; struct task_struct *p = (void *)ctx->args[0]; + if (!program_ready()) + return 0; + runqlen = BPF_CORE_READ(p, se.cfs_rq, nr_running); return trace_enqueue(p, runqlen); } @@ -178,10 +193,12 @@ int raw_tracepoint__sched_wakeup(struct bpf_raw_tracepoint_args *ctx) SEC("raw_tracepoint/sched_wakeup_new") int raw_tracepoint__sched_wakeup_new(struct bpf_raw_tracepoint_args *ctx) { - 
char abcde[16]; unsigned int runqlen = 0; struct task_struct *p = (void *)ctx->args[0]; + if (!program_ready()) + return 0; + runqlen = BPF_CORE_READ(p, se.cfs_rq, nr_running); return trace_enqueue(p, runqlen); } @@ -200,6 +217,9 @@ int handle_switch(struct trace_event_raw_sched_switch *ctx) struct latinfo *latp; struct latinfo lati; + if (!program_ready()) + return 0; + prev_pid = ctx->prev_pid; pid = ctx->next_pid; prev_state = ctx->prev_state; @@ -271,6 +291,8 @@ int BPF_KPROBE(account_process_tick, struct task_struct *p, int user_tick) struct latinfo lati, *latp; struct args args, *argsp; + if (!program_ready()) + return 0; __builtin_memset(&args_key, 0, sizeof(int)); argsp = bpf_map_lookup_elem(&argmap, &args_key); if (!argsp) diff --git a/source/tools/monitor/sched/schedmoni/runqslow.c b/source/tools/monitor/sched/schedmoni/runqslow.c index d74e89b2..6511560d 100644 --- a/source/tools/monitor/sched/schedmoni/runqslow.c +++ b/source/tools/monitor/sched/schedmoni/runqslow.c @@ -40,7 +40,8 @@ void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) void *runslw_handler(void *arg) { - int err = 0; + int i = 0, err = 0; + struct args bpf_args; struct tharg *data = (struct tharg *)arg; struct perf_buffer *pb = NULL; struct perf_buffer_opts pb_opts = {}; @@ -58,7 +59,16 @@ void *runslw_handler(void *arg) fprintf(stderr, "failed to open perf buffer: %d\n", err); goto clean_runslw; } - /* todo: warm up */ + + memset(&bpf_args, 0, sizeof(bpf_args)); + bpf_map_lookup_elem(data->ext_fd, &i, &bpf_args); + bpf_args.ready = true; + err = bpf_map_update_elem(data->ext_fd, &i, &bpf_args, 0); + if (err) { + fprintf(stderr, "Failed to update flag map\n"); + goto clean_runslw; + } + while (!exiting) { err = perf_buffer__poll(pb, 100); if (err < 0 && err != -EINTR) { diff --git a/source/tools/monitor/sched/schedmoni/schedmoni.c b/source/tools/monitor/sched/schedmoni/schedmoni.c index 92e1f475..ed1424a5 100644 --- a/source/tools/monitor/sched/schedmoni/schedmoni.c +++ 
b/source/tools/monitor/sched/schedmoni/schedmoni.c @@ -296,6 +296,7 @@ int main(int argc, char **argv) args.targ_pid = env.tid; args.min_us = env.min_us; args.flag = TIF_NEED_RESCHED; + args.ready = false; err = bpf_map_update_elem(arg_fd, &i, &args, 0); if (err) { @@ -311,6 +312,7 @@ int main(int argc, char **argv) } runslw.fd = ent_fd; + runslw.ext_fd = arg_fd; err = pthread_create(&pt_runslw, NULL, runslw_handler, &runslw); if (err) { fprintf(stderr, "can't pthread_create runslw: %s\n", strerror(errno)); diff --git a/source/tools/monitor/sched/schedmoni/schedmoni.h b/source/tools/monitor/sched/schedmoni/schedmoni.h index 16164442..57ff5510 100644 --- a/source/tools/monitor/sched/schedmoni/schedmoni.h +++ b/source/tools/monitor/sched/schedmoni/schedmoni.h @@ -19,6 +19,7 @@ struct args { pid_t targ_tgid; struct comm_item comm_i; int flag; + bool ready; }; struct tharg { -- Gitee