diff --git a/source/lib/internal/kernel_module/Makefile b/source/lib/internal/kernel_module/Makefile
index 5bd39d445217d3f47e28f9d323da0547ed18ffca..3fec106bbbaa4702def6b4967b77ec75f3ba9d09 100644
--- a/source/lib/internal/kernel_module/Makefile
+++ b/source/lib/internal/kernel_module/Makefile
@@ -27,6 +27,9 @@ endif
 ifneq ($(findstring taskctl,$(TARGET_LIST)),)
 sysak-objs += modules/task_ctl/task_ctrl.o
 endif
+ifneq ($(findstring schedtrace,$(TARGET_LIST)),)
+sysak-objs += modules/schedtrace/schedtrace.o
+endif
 
 obj-m += sysak.o
 
diff --git a/source/lib/internal/kernel_module/modules/schedtrace/schedtrace.c b/source/lib/internal/kernel_module/modules/schedtrace/schedtrace.c
new file mode 100644
index 0000000000000000000000000000000000000000..176408d32d5d443c60e5b6e9df4a8a9c06bb0508
--- /dev/null
+++ b/source/lib/internal/kernel_module/modules/schedtrace/schedtrace.c
@@ -0,0 +1,278 @@
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/ptrace.h>	/* regs_get_kernel_argument */
+#include <linux/threads.h>	/* PID_MAX_LIMIT */
+#include <linux/kallsyms.h>
+#include <linux/tracepoint.h>
+#include <linux/stacktrace.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include "sysak_mods.h"
+#include "common/proc.h"
+
+/* ARRAY_LEN sizes the trace ring buffer */
+#define ARRAY_LEN 1
+#define BUF_LEN 1024
+#define MAX_STACK_TRACE_DEPTH 8
+
+struct tracepoints_probe {
+	struct tracepoint *tp;
+	char *name;
+};
+
+struct traceinfo {
+	int idx;
+	struct stack_trace trace[ARRAY_LEN];
+	unsigned long entries[ARRAY_LEN][MAX_STACK_TRACE_DEPTH];
+};
+
+static int trace_in_fly;
+static int target_pid;
+static char buff[BUF_LEN] = {0};
+static struct traceinfo traceinfos;
+
+static struct tracepoints_probe mytp = {
+	.tp = NULL,
+	.name = "sched_switch",
+};
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+static struct tracepoint **swtc__start___tracepoints_ptrs;
+static struct tracepoint **swtc__stop___tracepoints_ptrs;
+
+static int swtc_init_local_tracepoints(void)
+{
+	swtc__start___tracepoints_ptrs = (void *)kallsyms_lookup_name("__start___tracepoints_ptrs");
+	swtc__stop___tracepoints_ptrs = (void *)kallsyms_lookup_name("__stop___tracepoints_ptrs");
+	if (swtc__start___tracepoints_ptrs == NULL || swtc__stop___tracepoints_ptrs == NULL)
+		return -1;
+	return 0;
+}
+
+static void swtc_for_each_tracepoint_range(struct tracepoint * const *begin,
+		struct tracepoint * const *end,
+		void (*fct)(struct tracepoint *tp, void *priv),
+		void *priv)
+{
+	struct tracepoint * const *iter;
+
+	if (!begin)
+		return;
+	for (iter = begin; iter < end; iter++)
+		fct(*iter, priv);
+}
+
+/**
+ * swtc_for_each_kernel_tracepoint - iterate over all kernel tracepoints
+ * @fct: callback
+ * @priv: private data
+ */
+void swtc_for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
+		void *priv)
+{
+	swtc_for_each_tracepoint_range(swtc__start___tracepoints_ptrs,
+			swtc__stop___tracepoints_ptrs, fct, priv);
+}
+#endif
+
+static void tracepoint_lookup(struct tracepoint *tp, void *priv)
+{
+	struct tracepoints_probe *tps = priv;
+
+	if (!strcmp(tp->name, tps->name))
+		tps->tp = tp;
+}
+
+static void
+(*stack_save_regs)(struct pt_regs *regs, struct stack_trace *trace);
+static void
+(*stack_save_tsk)(struct task_struct *tsk, struct stack_trace *trace);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+static void trace_sched_switch(void *priv,
+		struct task_struct *prev,
+		struct task_struct *next)
+#else
+static void trace_sched_switch(void *priv, bool preempt,
+		struct task_struct *prev,
+		struct task_struct *next)
+#endif
+{
+	struct task_struct *p;
+	int i, size = 0;
+
+	p = prev;
+	if (((pid_t)target_pid == p->pid) && (p->state)) {
+		struct traceinfo *tf = &traceinfos;
+		struct stack_trace *trace = tf->trace;
+		int idx = tf->idx;
+
+		tf->idx = (idx + 1) % ARRAY_LEN;
+		trace->nr_entries = 0;
+		trace->entries = tf->entries[idx];
+		trace->max_entries = MAX_STACK_TRACE_DEPTH;
+		trace->skip = 1;
+		stack_save_tsk(prev, trace);
+
+		idx = 0;
+		for (i = 0; i < trace->nr_entries - 1; i++) {
+			if (!trace->entries[i])
+				continue;
+			/* reserve room for "<addr>", the symbol and the ',' */
+			if (idx + KSYM_SYMBOL_LEN + 32 > BUF_LEN)
+				break;
+			size = sprintf(&buff[idx], "<%px>", (void *)(trace->entries[i]));
+			idx += size;
+			size = sprint_symbol(&buff[idx], trace->entries[i]);
+			idx += size;
+			size = sprintf(&buff[idx], ",");
+			idx += size;
+		}
+		trace_printk("%s\n", buff);
+		memset(trace, 0, sizeof(struct stack_trace));
+	}
+}
+
+static int pid_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "pid=%d\n", target_pid);
+	return 0;
+}
+
+static int pid_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pid_show, inode->i_private);
+}
+
+static ssize_t pid_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	if (count == 0 || count > PID_MAX_LIMIT)
+		return -EINVAL;
+
+	if (kstrtoint_from_user(buf, count, 0, &target_pid)) {
+		pr_warn("copy_from_user fail\n");
+		return -EFAULT;
+	}
+
+	if (target_pid < 0 && target_pid != -1)
+		return -EINVAL;
+
+	if (target_pid == -1 && trace_in_fly) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+		tracepoint_probe_unregister(mytp.name, trace_sched_switch, NULL);
+#else
+		tracepoint_probe_unregister(mytp.tp, trace_sched_switch, NULL);
+#endif
+		trace_in_fly = 0;
+	} else if (target_pid > 0 && !trace_in_fly) {
+		int ret;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+		ret = tracepoint_probe_register(mytp.name, trace_sched_switch, NULL);
+#else
+		ret = tracepoint_probe_register(mytp.tp, trace_sched_switch, NULL);
+#endif
+		if (ret)	/* register failed: report the error */
+			return ret;
+		trace_in_fly = 1;
+	}
+	return count;
+}
+
+static const struct file_operations pid_fops = {
+	.owner = THIS_MODULE,
+	.read = seq_read,
+	.open = pid_open,
+	.write = pid_write,
+	.release = seq_release,
+};
+
+static int proc_init(void)
+{
+	struct proc_dir_entry *parent;
+
+	parent = sysak_proc_mkdir("schedtrace");
+	if (!parent)
+		return -ENOMEM;
+
+	if (!proc_create("pid",
+			S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP,
+			parent,
+			&pid_fops))
+		goto proc_fail;
+	pr_info("proc_init schedtrace success\n");
+	return 0;
+
+proc_fail:
+	sysak_remove_proc_entry("schedtrace");
+	return -ENOMEM;
+}
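+
+/*
+ * Userspace drives this module through the proc file created above.
+ * A minimal usage sketch (the path assumes the default sysak proc
+ * root, /proc/sysak):
+ *
+ *   echo 1234 > /proc/sysak/schedtrace/pid   # arm the probe for pid 1234
+ *   cat /proc/sysak/schedtrace/pid           # -> pid=1234
+ *   echo -1   > /proc/sysak/schedtrace/pid   # disarm the probe
+ *
+ * The captured stacks are emitted with trace_printk(), so they show up
+ * in /sys/kernel/debug/tracing/trace (see schedtrace.py -r).
+ */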
+
+int schedtrace_init(void)
+{
+	int ret;
+
+	mytp.tp = NULL;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+	if (swtc_init_local_tracepoints())
+		return -ENODEV;
+	swtc_for_each_kernel_tracepoint(tracepoint_lookup, &mytp);
+#else
+	for_each_kernel_tracepoint(tracepoint_lookup, &mytp);
+#endif
+	if (!mytp.tp)
+		return -ENODEV;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+	ret = tracepoint_probe_register(mytp.name, trace_sched_switch, NULL);
+#else
+	ret = tracepoint_probe_register(mytp.tp, trace_sched_switch, NULL);
+#endif
+	if (ret) {
+		pr_warn("sched_switch probe fail\n");
+		return ret;
+	}
+	trace_in_fly = 1;
+
+	stack_save_tsk = (void *)kallsyms_lookup_name("save_stack_trace_tsk");
+	stack_save_regs = (void *)kallsyms_lookup_name("save_stack_trace_regs");
+	if (!stack_save_tsk || !stack_save_regs) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	ret = proc_init();
+	if (ret < 0) {
+		pr_warn("proc_init fail\n");
+		goto fail;
+	}
+
+	target_pid = -1;
+	return ret;
+fail:
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+	tracepoint_probe_unregister(mytp.name, trace_sched_switch, NULL);
+#else
+	tracepoint_probe_unregister(mytp.tp, trace_sched_switch, NULL);
+#endif
+	trace_in_fly = 0;
+	return ret;
+}
+
+int schedtrace_exit(void)
+{
+	if (trace_in_fly) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+		tracepoint_probe_unregister(mytp.name, trace_sched_switch, NULL);
+#else
+		tracepoint_probe_unregister(mytp.tp, trace_sched_switch, NULL);
+#endif
+		trace_in_fly = 0;
+	}
+	return 0;
+}
diff --git a/source/lib/internal/kernel_module/sysak_mods.c b/source/lib/internal/kernel_module/sysak_mods.c
index 1370a9a3037985ce5319ce07b73b8b4228321903..021c9d1216caed0cfd22ee1815834de44ef861c7 100644
--- a/source/lib/internal/kernel_module/sysak_mods.c
+++ b/source/lib/internal/kernel_module/sysak_mods.c
@@ -40,11 +40,22 @@ int __attribute__((weak)) task_ctl_exit(void)
 	return 0;
 }
 
+int __attribute__((weak)) schedtrace_init(void)
+{
+	return 0;
+}
+
+int __attribute__((weak)) schedtrace_exit(void)
+{
+	return 0;
+}
+
 struct sysak_module sysak_modules[] = {
 	{ "trace_sig", trace_sig_init, trace_sig_exit},
 	{ "memleak", memleak_init, memleak_uninit},
 	{ "trace_irqoff", trace_irqoff_init, trace_irqoff_exit},
 	{ "task_ctl", task_ctl_init, task_ctl_exit},
+	{ "schedtrace", schedtrace_init, schedtrace_exit},
 };
 
 const int sysk_module_num = sizeof(sysak_modules) / sizeof(struct sysak_module);
diff --git a/source/tools/monitor/sched/nosched/Makefile b/source/tools/monitor/sched/nosched/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0fa6285765af8f3a3470d03ad6c683bf472fcc45
--- /dev/null
+++ b/source/tools/monitor/sched/nosched/Makefile
@@ -0,0 +1,8 @@
+
+newdirs := $(shell find ./ -type d)
+
+bpfsrcs := $(wildcard bpf/*.bpf.c)
+csrcs := $(wildcard *.c)
+target := nosched
+
+include $(SRC)/mk/bpf.mk
diff --git a/source/tools/monitor/sched/nosched/README.txt b/source/tools/monitor/sched/nosched/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..46457a5def29314ce0ce08aaf9800860cd4e9510
--- /dev/null
+++ b/source/tools/monitor/sched/nosched/README.txt
@@ -0,0 +1,77 @@
+1 run the nosched
+  $ sudo ./out/sysak nosched        #default threshold 10ms
+  or
+  $ sudo ./out/sysak nosched -t 1   #set the threshold to 1ms
+
+The output looks like:
+sudo ./out/sysak nosched -t 1
+Threshold set to 1 ms
+libbpf: loading object 'nosched_bpf' from buffer
+..... (#a lot of messages)
+Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` to see output of the BPF programs.
+Running....
+ tips: Ctrl+C shows the result!
+
+2 get the result
+2.1 use trace_pipe (optional, for debugging)
+  $ sudo cat /sys/kernel/debug/tracing/trace_pipe
+  <...>-110777 [014] dNh. 19777.314467: 0: cc1 :lat is 1001 us, 1 ticks
+  <...>-110849 [016] dNh. 19777.556471: 0: cc1 :lat is 1000 us, 1 ticks
+  <...>-110712 [000] dNh. 19777.932467: 0: cc1 :lat is 1005 us, 1 ticks
+2.2 stop the process and get the result
+Press Ctrl+C to stop the process; the result looks as follows:
+Running....
+ tips: Ctrl+C shows the result!
+^C
+***********************************
+cc1<116321> [19795.442018507]: lat=4000us, lat_tick=4
+    apic_timer_interrupt
+    __lock_text_start
+    release_pages
+    tlb_flush_mmu_free
+    arch_tlb_finish_mmu
+    tlb_finish_mmu
+    exit_mmap
+    mmput
+    do_exit
+    do_group_exit
+    get_signal
+    do_signal
+    exit_to_usermode_loop
+    prepare_exit_to_usermode
+    swapgs_restore_regs_and_return_to_usermode
+----------------------
+cc1<111581> [19775.265934964]: lat=1005us, lat_tick=1
+    apic_timer_interrupt
+    free_unref_page_list
+    release_pages
+    tlb_flush_mmu_free
+    arch_tlb_finish_mmu
+    tlb_finish_mmu
+    exit_mmap
+    mmput
+    do_exit
+    do_group_exit
+    __x64_sys_exit_group
+    do_syscall_64
+    entry_SYSCALL_64_after_hwframe
+..........(#a lot of messages)
+
+3 the results
+3.1 headers
+ comm&pid      timestamp       latency(us)  latency(tick)
+    |              |                |             |
+cc1<111581> [19775.265934964]: lat=1005us, lat_tick=1
+
+comm&pid: the name (comm) and pid of the task that had the need_resched flag set but got no schedule() for the threshold time.
+timestamp: the timestamp when the no-sched event happened.
+latency(us): how many microseconds the task with the need_resched flag went without a schedule().
+latency(tick): like latency, but counted in ticks.
+
+3.2 stack
+The stack back-trace of the current (the offending) context.
+    apic_timer_interrupt
+    free_unref_page_list
+    release_pages
+    tlb_flush_mmu_free
+......
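+
+4 how it works (a rough sketch)
+Two BPF programs cooperate through a per-cpu hash map (info_map); the
+pseudo-C below is only an illustration condensed from bpf/nosched.bpf.c:
+
+    /* kprobe on account_process_tick(): runs every timer tick */
+    if (task has TIF_NEED_RESCHED set) {
+        if (!lat->last_seen_need_resched_ns)
+            lat->last_seen_need_resched_ns = now;  /* first tick with the flag */
+        else if (now - lat->last_seen_need_resched_ns > threshold)
+            save kernel stack + comm/pid/latency;  /* -> stackmap/stackmap_ext */
+    }
+
+    /* tracepoint sched:sched_switch: the reschedule finally happened */
+    lat->last_seen_need_resched_ns = 0;
+
+The userspace loader only writes the threshold into args_map, then on
+Ctrl+C drains stackmap_ext and prints the records shown in 2.2.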
diff --git a/source/tools/monitor/sched/nosched/bpf/nosched.bpf.c b/source/tools/monitor/sched/nosched/bpf/nosched.bpf.c
new file mode 100644
index 0000000000000000000000000000000000000000..9cba4a19e12bd80cd9e814b6441e73a7c39c2666
--- /dev/null
+++ b/source/tools/monitor/sched/nosched/bpf/nosched.bpf.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/* Copyright (c) 2021 Sartura */
+/*
+ * Todo: 1. how to detect whether CONFIG_THREAD_INFO_IN_TASK is configured?
+ *       2. why does #ifdef __x86_64__ not work in nosched.bpf.c?
+ *       3. some magic numbers, like max_entries, need to be configurable
+ */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "../nosched.comm.h"
+
+#define BPF_F_FAST_STACK_CMP (1ULL << 9)
+#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
+
+#define BITS_PER_LONG 64
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+struct bpf_map_def SEC("maps") args_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct args),
+	.max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") stackmap = {
+	.type = BPF_MAP_TYPE_STACK_TRACE,
+	.key_size = sizeof(u32),
+	.value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
+	.max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") stackmap_ext = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(struct ext_key),
+	.value_size = sizeof(struct ext_val),
+	.max_entries = 10000,
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+	__uint(max_entries, MAX_MONI_NR);
+	__type(key, u64);
+	__type(value, struct latinfo);
+} info_map SEC(".maps");
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+static inline int test_bit(int nr, const volatile unsigned long *addr)
+{
+	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+static inline int test_ti_thread_flag(struct thread_info *ti, int nr)
+{
+	int result;
+	unsigned long *addr;
+	unsigned long tmp = _(ti->flags);
+
+	addr = &tmp;
+	result = 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+	return result;
+}
+
+static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
+{
+	struct thread_info tf, *tfp;
+
+	/*
+	 * This assumes CONFIG_THREAD_INFO_IN_TASK, i.e. thread_info is
+	 * embedded in task_struct.  Without it, thread_info lives on the
+	 * stack: task_thread_info(task) == (struct thread_info *)(task)->stack
+	 */
+	bpf_probe_read(&tf, sizeof(tf), &(tsk->thread_info));
+	tfp = &tf;
+	return test_ti_thread_flag(tfp, flag);
+}
+
+static inline int test_tsk_need_resched(struct task_struct *tsk, int flag)
+{
+	return test_tsk_thread_flag(tsk, flag);
+}
+
+SEC("kprobe/account_process_tick")
+int BPF_KPROBE(account_process_tick, struct task_struct *p, int user_tick)
+{
+	int args_key;
+	u64 cpuid;
+	u64 resched_latency, now;
+	struct latinfo lati, *latp;
+	struct args *argsp;
+
+	__builtin_memset(&args_key, 0, sizeof(int));
+	argsp = bpf_map_lookup_elem(&args_map, &args_key);
+	if (!argsp)
+		return 0;
+
+	if (!test_tsk_need_resched(p, _(argsp->flag)))
+		/* || system_state == SYSTEM_BOOTING */
+		return 0;
+
+	now = bpf_ktime_get_ns();
+
+	__builtin_memset(&cpuid, 0, sizeof(u64));
+	cpuid = bpf_get_smp_processor_id();
+	latp = bpf_map_lookup_elem(&info_map, &cpuid);
+	if (latp) {
+		if (!latp->last_seen_need_resched_ns) {
+			latp->last_seen_need_resched_ns = now;
+			latp->ticks_without_resched = 0;
+		} else {
+			latp->ticks_without_resched++;
+			resched_latency = now - latp->last_seen_need_resched_ns;
+			if (resched_latency > _(argsp->thresh)) {
+				struct key_t key;
+				struct ext_key ext_key;
+				struct ext_val ext_val;
+
+				__builtin_memset(&key, 0, sizeof(struct key_t));
+				__builtin_memset(&ext_key, 0, sizeof(struct ext_key));
+				__builtin_memset(&ext_val, 0, sizeof(struct ext_val));
+				key.ret = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
+				ext_key.stamp = now;
+				ext_key.ret = key.ret;
+				ext_val.lat_us = resched_latency/1000;
+				bpf_get_current_comm(&ext_val.comm, sizeof(ext_val.comm));
+				ext_val.pid = bpf_get_current_pid_tgid();
+				ext_val.nosched_ticks = latp->ticks_without_resched;
+				bpf_map_update_elem(&stackmap_ext, &ext_key, &ext_val, BPF_ANY);
+				bpf_printk("%s :lat is %ld us, %d ticks\n", ext_val.comm,
+					   resched_latency/1000, latp->ticks_without_resched);
+			}
+		}
+	} else {
+		__builtin_memset(&lati, 0, sizeof(struct latinfo));
+		lati.last_seen_need_resched_ns = now;
+		lati.ticks_without_resched = 0;
+		bpf_map_update_elem(&info_map, &cpuid, &lati, BPF_ANY);
+	}
+
+	return 0;
+}
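+
+/*
+ * Note on the updates through latp above: bpf_map_lookup_elem() returns
+ * a pointer into the map value itself, so writing *latp updates info_map
+ * in place; no second bpf_map_update_elem() call is needed.
+ */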
+
+/*
+struct trace_event_raw_sched_switch {
+	struct trace_entry ent;
+	char prev_comm[16];
+	pid_t prev_pid;
+	int prev_prio;
+	long int prev_state;
+	char next_comm[16];
+	pid_t next_pid;
+	int next_prio;
+	char __data[0];
+};
+ */
+SEC("tp/sched/sched_switch")
+int handle_switch(struct trace_event_raw_sched_switch *ctx)
+{
+	u64 cpuid;
+	struct latinfo lati, *latp;
+
+	cpuid = bpf_get_smp_processor_id();
+	latp = bpf_map_lookup_elem(&info_map, &cpuid);
+	if (latp) {
+		/* a switch happened: clear the pending need_resched timestamp */
+		latp->last_seen_need_resched_ns = 0;
+	} else {
+		__builtin_memset(&lati, 0, sizeof(struct latinfo));
+		lati.last_seen_need_resched_ns = 0;
+		lati.ticks_without_resched = 0;
+		bpf_map_update_elem(&info_map, &cpuid, &lati, BPF_ANY);
+	}
+
+	return 0;
+}
diff --git a/source/tools/monitor/sched/nosched/nosched.c b/source/tools/monitor/sched/nosched/nosched.c
new file mode 100644
index 0000000000000000000000000000000000000000..6573ac8feef4393ae1e0683798ab45891666f0a2
--- /dev/null
+++ b/source/tools/monitor/sched/nosched/nosched.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (c) 2021 Sartura
+ * Based on minimal.c by Facebook */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <getopt.h>
+#include <limits.h>
+#include <sys/resource.h>
+#include <bpf/bpf.h>	/* bpf_obj_pin */
+#include <bpf/libbpf.h>
+#include "nosched.comm.h"
+#include "bpf/nosched.skel.h"
+
+#define MAX_SYMS 300000
+static struct ksym syms[MAX_SYMS];
+static int sym_cnt;
+char *help_str = "sysak nosched";
+
+static void usage(char *prog)
+{
+	const char *str =
+	"  Usage: %s [OPTIONS]\n"
+	"  Options:\n"
+	"   -t			specify the threshold time in ms, default=10ms\n"
+	;
+
+	fprintf(stderr, str, prog);
+	exit(EXIT_FAILURE);
+}
+
+static int ksym_cmp(const void *p1, const void *p2)
+{
+	const struct ksym *s1 = p1, *s2 = p2;
+
+	/* compare instead of subtracting: a long difference may not fit in int */
+	return s1->addr < s2->addr ? -1 : (s1->addr > s2->addr ? 1 : 0);
+}
+
+int load_kallsyms(void)
+{
+	FILE *f = fopen("/proc/kallsyms", "r");
+	char func[256], buf[256];
+	char symbol;
+	void *addr;
+	int i = 0;
+
+	if (!f)
+		return -ENOENT;
+
+	while (!feof(f)) {
+		if (!fgets(buf, sizeof(buf), f))
+			break;
+		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
+			break;
+		if (!addr)
+			continue;
+		if (i >= MAX_SYMS)
+			break;	/* symbol table full */
+		syms[i].addr = (long) addr;
+		syms[i].name = strdup(func);
+		i++;
+	}
+	fclose(f);
+	sym_cnt = i;
+	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
+	return 0;
+}
+
+struct ksym *ksym_search(long key)
+{
+	int start = 0, end = sym_cnt;
+	long result;
+
+	/* kallsyms not loaded. return NULL */
+	if (sym_cnt <= 0)
+		return NULL;
+
+	while (start < end) {
+		size_t mid = start + (end - start) / 2;
+
+		result = key - syms[mid].addr;
+		if (result < 0)
+			end = mid;
+		else if (result > 0)
+			start = mid + 1;
+		else
+			return &syms[mid];
+	}
+
+	if (start >= 1 && syms[start - 1].addr < key &&
+	    key < syms[start].addr)
+		/* valid ksym */
+		return &syms[start - 1];
+
+	/* out of range. return _stext */
+	return &syms[0];
+}
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	return vfprintf(stderr, format, args);
+}
+
+static void bump_memlock_rlimit(void)
+{
+	struct rlimit rlim_new = {
+		.rlim_cur = RLIM_INFINITY,
+		.rlim_max = RLIM_INFINITY,
+	};
+
+	if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) {
+		fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
+		exit(1);
+	}
+}
+
+static void print_ksym(__u64 addr)
+{
+	struct ksym *sym;
+
+	if (!addr)
+		return;
+
+	sym = ksym_search(addr);
+	printf("<%llx> %s\n", addr, sym->name);
+}
+
+static void print_stack(int fd, struct ext_key *key)
+{
+	int i;
+	__u64 ip[PERF_MAX_STACK_DEPTH] = {};
+
+	if (bpf_map_lookup_elem(fd, &key->ret, &ip) == 0) {
+		/* skip the first frames: tick/irq entry glue common to all stacks */
+		for (i = 7; i < PERF_MAX_STACK_DEPTH; i++)
+			print_ksym(ip[i]);
+	} else {
+		if ((int)(key->ret) < 0)
+			printf("<0x0000000000000000>:error=%d\n", (int)(key->ret));
+	}
+}
+
+static void print_stacks(int fd, int ext_fd)
+{
+	struct ext_key ext_key = {}, next_key;
+	struct ext_val value;
+
+	printf("***********************************\n");
+	while (bpf_map_get_next_key(ext_fd, &ext_key, &next_key) == 0) {
+		bpf_map_lookup_elem(ext_fd, &next_key, &value);
+		/* stamp is in ns; pad the fractional part to 9 digits */
+		printf("%s<%d> [%lld.%09lld]: lat=%lldus, lat_tick=%d\n",
+		       value.comm, value.pid, next_key.stamp/(1000000000),
+		       next_key.stamp%(1000000000), value.lat_us,
+		       value.nosched_ticks);
+		print_stack(fd, &next_key);
+		printf("----------------------\n");
+		bpf_map_delete_elem(ext_fd, &next_key);
+		ext_key = next_key;
+	}
+	printf("\n");
+}
+
+static volatile sig_atomic_t stop;
+
+static void sig_int(int signo)
+{
+	stop = 1;
+}
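+
+/*
+ * print_stacks() above drains stackmap_ext with the usual walk-and-delete
+ * idiom: bpf_map_get_next_key() iterates the keys, and each entry is
+ * deleted once printed so the hash map does not fill up across runs.
+ */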
+
+int main(int argc, char **argv)
+{
+	struct nosched_bpf *skel;
+	struct args args;
+	int c, option_index, args_key;
+	unsigned long val;
+	int err, map_fd0, /*map_fd1,*/ map_fd2, map_fd3;
+
+	val = LAT_THRESH_NS;
+	for (;;) {
+		c = getopt_long(argc, argv, "t:h", NULL, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 't':
+			errno = 0;
+			val = strtoul(optarg, NULL, 10);
+			if (errno != 0) {
+				perror("strtoul");
+				return errno;
+			}
+			printf("Threshold set to %lu ms\n", val);
+			val = val*1000*1000;	/* ms -> ns */
+			break;
+		case 'h':
+			usage(help_str);
+			break;
+		default:
+			usage(help_str);
+		}
+	}
+	/* Set up libbpf errors and debug info callback */
+	libbpf_set_print(libbpf_print_fn);
+
+	/* Bump RLIMIT_MEMLOCK to allow BPF sub-system to do anything */
+	bump_memlock_rlimit();
+	err = load_kallsyms();
+	if (err) {
+		fprintf(stderr, "Failed to load kallsyms\n");
+		return err;
+	}
+	/* Open, load and verify BPF application */
+	skel = nosched_bpf__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to open BPF skeleton\n");
+		return 1;
+	}
+
+	err = nosched_bpf__load(skel);
+	if (err) {
+		fprintf(stderr, "Failed to load BPF skeleton\n");
+		goto cleanup;
+	}
+	/* Attach tracepoint handler */
+	err = nosched_bpf__attach(skel);
+	if (err) {
+		fprintf(stderr, "Failed to attach BPF skeleton\n");
+		goto cleanup;
+	}
+
+	map_fd0 = bpf_map__fd(skel->maps.args_map);
+	/* map_fd1 = bpf_map__fd(skel->maps.info_map); */
+	map_fd2 = bpf_map__fd(skel->maps.stackmap);
+	map_fd3 = bpf_map__fd(skel->maps.stackmap_ext);
+	if (signal(SIGINT, sig_int) == SIG_ERR) {
+		fprintf(stderr, "can't set signal handler: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	args_key = 0;
+	args.flag = TIF_NEED_RESCHED;
+	args.thresh = val;
+	err = bpf_map_update_elem(map_fd0, &args_key, &args, 0);
+	if (err) {
+		fprintf(stderr, "Failed to update args map\n");
+		goto cleanup;
+	}
+
+	printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` "
+	       "to see output of the BPF programs.\n");
+
+	fprintf(stderr, "Running....\n tips: Ctrl+C shows the result!\n");
+	while (!stop) {
+		sleep(1);
+	}
+	printf("\n");
+	print_stacks(map_fd2, map_fd3);
+cleanup:
+	nosched_bpf__destroy(skel);
+	return -err;
+}
diff --git a/source/tools/monitor/sched/nosched/nosched.comm.h b/source/tools/monitor/sched/nosched/nosched.comm.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6c4acdadf04c44af5025ac11e948cdb7d701f8c
--- /dev/null
+++ b/source/tools/monitor/sched/nosched/nosched.comm.h
@@ -0,0 +1,46 @@
+#define BPF_ANY 0
+#define MAX_MONI_NR 1024
+
+/* latency threshold: 10 ms */
+#define LAT_THRESH_NS (10*1000*1000)
+#define TASK_COMM_LEN 16
+#define PERF_MAX_STACK_DEPTH 32
+
+#ifdef __x86_64__
+#define TIF_NEED_RESCHED 3
+#elif defined (__aarch64__)
+#define TIF_NEED_RESCHED 1
+#endif
+
+struct args {
+	int flag;
+	__u64 thresh;
+};
+
+struct ksym {
+	long addr;
+	char *name;
+};
+
+struct key_t {
+	__u32 ret;
+};
+
+struct ext_key {
+	__u32 ret;
+	__u64 stamp;
+};
+
+struct ext_val {
+	int pid;
+	int nosched_ticks;
+	__u64 lat_us;
+	char comm[TASK_COMM_LEN];
+};
+
+struct latinfo {
+	__u64 last_seen_need_resched_ns;
+	int ticks_without_resched;
+};
diff --git a/source/tools/monitor/sched/schedtrace/Makefile b/source/tools/monitor/sched/schedtrace/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5a3aff5aa647679dfa6e07e6dbaa2ad98666ebb9
--- /dev/null
+++ b/source/tools/monitor/sched/schedtrace/Makefile
@@ -0,0 +1,5 @@
+mods := schedtrace
+DEPEND := "prev{-e modin};post{-d modun}"
+target := schedtrace
+
+include $(SRC)/mk/py.mk
diff --git a/source/tools/monitor/sched/schedtrace/schedtrace.py b/source/tools/monitor/sched/schedtrace/schedtrace.py
new file mode 100755
index 0000000000000000000000000000000000000000..e4eeb4306cffdd151f63bb560f72ff1fcbb32426
--- /dev/null
+++ b/source/tools/monitor/sched/schedtrace/schedtrace.py
@@ -0,0 +1,317 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+import sys,os
+import getopt
+import json
+import linecache
+import subprocess
+from collections import OrderedDict
+
+eventpid = 0
+enable_level = 0
+disable_level = 0
+MAX_OUT_FILE = 512*1024*1024
+ONE_FILE_MAX = 64*1024*1024
+FILE_CACHE_SIZE = 4096
+
+traceoutf = "/sys/kernel/debug/tracing/trace"
+pidfile = "/sys/kernel/debug/tracing/set_event_pid"
+traceswfile = "/proc/sysak/schedtrace/pid"
+
+#level low
+tracefs1 = ("/sys/kernel/debug/tracing/events/sched/sched_wakeup/enable",
+            "/sys/kernel/debug/tracing/events/sched/sched_migrate_task/enable",
+            "/sys/kernel/debug/tracing/events/sched/sched_switch/enable",
+            "/sys/kernel/debug/tracing/events/sched/sched_process_exit/enable")
+
+#level middle
+tracefs2 = ("/sys/kernel/debug/tracing/events/sched/sched_stat_iowait/enable",
+            "/sys/kernel/debug/tracing/events/signal/signal_deliver/enable",
+            "/sys/kernel/debug/tracing/events/signal/signal_generate/enable")
+
+#level high
+tracefs3 = ("/sys/kernel/debug/tracing/events/irq/irq_handler_entry/enable",
+            "/sys/kernel/debug/tracing/events/irq/irq_handler_exit/enable",
+            "/sys/kernel/debug/tracing/events/irq/softirq_entry/enable",
+            "/sys/kernel/debug/tracing/events/irq/softirq_exit/enable",
+            "/sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/enable",
+            "/sys/kernel/debug/tracing/events/raw_syscalls/sys_exit/enable")
+
+tracefiles = (tracefs1, tracefs2, tracefs3)
+
+def check_essential_files():
+    if not os.path.isfile(traceswfile):
+        print 'WARN: no such file: %s' % traceswfile
+        print '  --use root, or check whether [schedtrace.ko] is loaded--\n'
+    if not os.path.isfile(traceoutf):
+        print 'WARN: no such file: %s' % traceoutf
+        print '  --use root, or check whether debugfs/tracefs is mounted--\n'
+    if not os.path.isfile(pidfile):
+        print 'WARN: no such file: %s' % pidfile
+        print '  --use root, or check whether debugfs/tracefs is mounted--\n'
+    i = 0
+    not_exist = 0
+    while i < 3:
+        traces = tracefiles[i]
+        for tracef in traces:
+            if not os.path.isfile(tracef):
+                not_exist = not_exist + 1
+        i = i + 1
+    if not_exist > 1:
+        print 'WARN: some files under /sys/kernel/debug/tracing/events/ do not exist'
+        print '  --use root, or check whether debugfs/tracefs is mounted--'
+
+    return
+
+def disable_all_trace():
+    i = 0
+    while i < 3:
+        traces = tracefiles[i]
+        for tracef in traces:
+            subprocess.call("echo 0 >"+tracef, shell=True)
+        i = i + 1
+    return
+
+def parse_info(event, info, pid):
+    if event == 'sched_switch':
+        strs = info.split('==>')
+        if pid in strs[0]:
+            event = 'switch_out'
+        elif pid in strs[1]:
+            event = 'switch_in'
+        else:
+            print 'error: pid=%s, strs0=%s, strs1=%s' % (pid, strs[0], strs[1])
+    return event
+
+def raw_to_json(inf, outf):
+    ifp = open(inf, "r")
+    ofp = open(outf, "w")
+    line1st = linecache.getline(inf, 1)
+    target_pid = line1st.split('#')[-1].strip('\n')
+    for line in ifp.readlines():
+        if line[0] == '#':
+            continue
+        strs = line.split(':', 2)
+        if len(strs) < 3:
+            print 'line:\n %s is not valid' % line
+            continue
+        dic = OrderedDict()
+        dic['time'] = strs[0].split()[-1]
+        info = strs[2].strip('\n')
+        event = strs[1].split()[0]
+        dic['event'] = parse_info(event, info, target_pid)
+        dic['info'] = info
+        # OrderedDict keeps the time/event/info key order; a plain dict would not
+        ofp.write(json.dumps(dic, sort_keys=False)+',\n')
+    ifp.close()
+    ofp.close()
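+
+# raw_to_json() emits one JSON object per trace line.  An illustrative
+# (hypothetical) record for a sched_switch where the target pid is the
+# previous task, i.e. it was switched out:
+#
+#   {"time": "19777.314467", "event": "switch_out", "info": "..."},
+#
+# 'event' is the raw tracepoint name, except for sched_switch, which
+# parse_info() rewrites to switch_in/switch_out depending on which side
+# of the '==>' the target pid appears on.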
+
+def list_config():
+    p = subprocess.Popen("cat "+pidfile, shell=True, stdout=subprocess.PIPE)
+    p.wait()
+    print '%s=%s' % (pidfile, p.stdout.readline().strip('\n'))
+    i = 0
+    while i < 3:
+        traces = tracefiles[i]
+        for tracef in traces:
+            p = subprocess.Popen("cat "+tracef, shell=True, stdout=subprocess.PIPE)
+            p.wait()
+            print '%s=%s' % (tracef, p.stdout.readline().strip('\n'))
+        i = i + 1
+    return
+
+def switch_fp(curr, t1, t2):
+    if curr == t1:
+        return t2
+    elif curr == t2:
+        return t1
+    else:
+        print 'Warn: switch_fp(): curr is neither t1 nor t2'
+    return curr
+
+def record_traceinfo(outfile):
+    tmp1 = outfile+'1'
+    tmp2 = outfile+'2'
+    trfp = open(traceoutf, 'r')
+    t1fp = open(tmp1, 'w')
+    t2fp = open(tmp2, 'w')
+    total = 0
+    count = 0
+    tmpfp1 = t1fp
+    line1st = linecache.getline(pidfile, 1)
+    pid = line1st.split()[0]
+    tmpfp1.write('#'+pid+'\n')
+    try:
+        while True:
+            s = trfp.read(FILE_CACHE_SIZE)
+            if s == '':
+                break
+            tmpfp1.write(s)
+            count = count + FILE_CACHE_SIZE
+            total = total + FILE_CACHE_SIZE
+            if total > MAX_OUT_FILE:
+                break
+            if count > ONE_FILE_MAX:
+                # ping-pong between the two temp files so at most
+                # ~2*ONE_FILE_MAX is kept on disk
+                tmpfp1 = switch_fp(tmpfp1, t1fp, t2fp)
+                tmpfp1.seek(0, 0)
+                tmpfp1.write('#'+pid+'\n')
+                count = 0
+    finally:
+        tmpfp2 = switch_fp(tmpfp1, t1fp, t2fp)
+        filename1 = tmpfp1.name
+        filename2 = tmpfp2.name
+        index1 = tmpfp1.tell()
+        index2 = tmpfp2.tell()
+        trfp.close()
+        tmpfp1.close()
+        tmpfp2.close()
+        if index1 == 0 and index2 == 0:
+            print '-- Notice: traced nothing --'
+            os.remove(filename1)
+            os.remove(filename2)
+            return
+
+        if index1 != 0:
+            os.rename(filename1, outfile)
+            if index2 != 0:
+                os.rename(filename2, outfile+'.old')
+            else:
+                os.remove(filename2)
+        else:
+            os.rename(filename2, outfile)
+        print 'write total=%d KB' % (total/1024)
+
+def usage(app):
+    print '=============='
+    print 'Usage:'
+    print ' -l/--list     list the current trace config'
+    print ' -p/--pid      set the target pid we want to trace'
+    print ' -s/--size     set the size limit of the output file, in MB'
+    print '               size: limit the output file size, default 512MB'
+    print ' -r/--read     read the trace info to outfile'
+    print '               stdout: print the result to the stdout console'
+    print ' -j/--json     convert the trace logfile to json'
+    print '               logfile: the source file for the json output'
+    print ' -e/--enable   enable sched trace, used with -p/--pid'
+    print '               l: enable low-level trace'
+    print '               m: enable middle-level trace'
+    print '               h: enable high-level trace'
+    print ' -d/--disable  disable sched trace'
+    print '               l: disable low-level trace'
+    print '               m: disable middle-level trace'
+    print '               h: disable high-level trace'
+    print '               p: disable tracing of target_pid'
+    print '=============='
+    return
+
+if len(sys.argv) < 2:
+    usage(sys.argv[0])
+    sys.exit("invalid args")
+
+try:
+    opts,args = getopt.getopt(sys.argv[1:], 'hle:d:p:s:r:j:',['help','list', 'enable=','disable=','pid=','size=','read=','json='])
+except getopt.GetoptError, e:
+    sys.stderr.write("Error:[%s] %s\n" % (sys.argv[0].strip(".py"), e.msg))
+    print 'Use -h/--help to see the usage!'
+    sys.exit(1)
+for opt_name,opt_value in opts:
+    if opt_name in ('-h','--help'):
+        usage(sys.argv[0])
+        sys.exit()
+    if opt_name in ('-l','--list'):
+        check_essential_files()
+        list_config()
+        sys.exit()
+    if opt_name in ('-p', '--pid'):
+        eventpid = int(opt_value)
+        continue
+    if opt_name in ('-s', '--size'):
+        ONE_FILE_MAX = 1024*1024*int(opt_value)
+        continue
+    if opt_name in ('-j', '--json'):
+        raw_to_json(opt_value, opt_value+'.json')
+        continue
+    if opt_name in ('-r', '--read'):
+        outfile = opt_value
+        if outfile == 'stdout':
+            subprocess.call("cat "+traceoutf, shell=True)
+        else:
+            print 'write schedtrace log to %s' % outfile
+            # stop all trace events first so the snapshot is consistent
+            disable_all_trace()
+            record_traceinfo(outfile)
+            subprocess.call("echo >"+traceoutf, shell=True)
+            subprocess.call("echo -1 >"+traceswfile, shell=True)
+        continue
+    if opt_name in ('-e', '--enable'):
+        enopt = opt_value
+        if enopt == 'l':
+            enable_level = 1
+        elif enopt == 'm':
+            enable_level = 3
+        elif enopt == 'h':
+            enable_level = 7
+        else:
+            usage(sys.argv[0])
+            sys.exit(" ::-e/--enable with invalid value")
+        continue
+    if opt_name in ('-d', '--disable'):
+        enopt = opt_value
+        if enopt == 'l':
+            disable_level = 7
+        elif enopt == 'm':
+            disable_level = 6
+        elif enopt == 'h':
+            disable_level = 4
+        elif enopt == 'p':
+            disable_level = 8
+        else:
+            usage(sys.argv[0])
+            sys.exit(" ::-d/--disable with invalid value")
+        continue
+    else:
+        usage(sys.argv[0])
+        sys.exit(" ::undefined opt "+opt_name)
+
+if enable_level:
+    if eventpid == 0:
+        usage(sys.argv[0])
+        sys.exit(" ::target pid must be set first, use -p")
+    if disable_level:
+        sys.exit(" ::enable and disable can't be used together; use only -d or -e")
+    check_essential_files()
+    i = 0
+    subprocess.call("echo "+str(eventpid)+" >"+pidfile, shell=True)
+    subprocess.call("echo "+str(eventpid)+" >"+traceswfile, shell=True)
+    while i < 3:
+        if (enable_level & (1 << i)):
+            for tracefile in tracefiles[i]:
+                subprocess.call("echo 1 >"+tracefile, shell=True)
+        i = i + 1
+elif disable_level:
+    if enable_level:
+        sys.exit(" ::enable and disable can't be used together; use only -d or -e")
+    check_essential_files()
+    # Finer-grained control would be more complex; for now we take a
+    # simple, coarse-grained approach.
+    if disable_level == 8:
+        if eventpid == 0:
+            sys.exit(" ::disabling the traced pid (-d p) must be used with -p")
+        else:
+            disable_level = 7
+        i = 0
+        while i < 3:
+            if disable_level & (1 << i):
+                for tracefile in tracefiles[i]:
+                    subprocess.call("echo 0 >"+tracefile, shell=True)
+            i = i + 1
+        subprocess.call('echo >'+pidfile, shell=True)
+        sys.exit()
+
+    i = 0
+    while i < 3:
+        if disable_level & (1 << i):
+            for tracefile in tracefiles[i]:
+                subprocess.call("echo 0 >"+tracefile, shell=True)
+        i = i + 1
+    subprocess.call("echo -1 >"+traceswfile, shell=True)