diff --git a/source/tools/detect/sched/tasktop/tasktop.c b/source/tools/detect/sched/tasktop/tasktop.c
index 8ea64dbb78643f4f0e6926715fc166b6c08f6dff..566f380d4e3dac06ddefdc57fa14cf631356ff2c 100644
--- a/source/tools/detect/sched/tasktop/tasktop.c
+++ b/source/tools/detect/sched/tasktop/tasktop.c
@@ -21,6 +21,7 @@
 #include "common.h"
 
 // #define DEBUG
+// #define LOG_DEBUG
 // #define ONLY_THREAD
 // #define STRESS_TEST
 
@@ -372,7 +373,48 @@ cleanup:
     return 0;
 }
 
-static int read_cgroup_throttle(cgroup_cpu_stat_t* cgroups, int* cgroup_num) {
+static int check_cgroup(cgroup_cpu_stat_t** prev_cgroups, char* cgroup_name) {
+    int i = 0;
+    for (i = 0; i < env.cgroup_limit; i++) {
+        if (prev_cgroups[i] &&
+            !strcmp(cgroup_name, prev_cgroups[i]->cgroup_name)) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+static cgroup_cpu_stat_t* get_cgroup(cgroup_cpu_stat_t** prev_cgroups,
+                                     int idx) {
+    int i = 0;
+    if (idx < env.cgroup_limit && idx >= 0) {
+        return prev_cgroups[idx];
+    }
+
+    /* find an empty slot */
+    for (i = 0; i < env.cgroup_limit; i++) {
+        if (!prev_cgroups[i]) {
+            prev_cgroups[i] = calloc(1, sizeof(cgroup_cpu_stat_t));
+            prev_cgroups[i]->last_update = time(0);
+            return prev_cgroups[i];
+        }
+    }
+
+    /* otherwise reuse the slot that has gone longest without an update */
+    time_t last_time = prev_cgroups[0]->last_update;
+    int res_idx = 0;
+    for (i = 0; i < env.cgroup_limit; i++) {
+        if (last_time > prev_cgroups[i]->last_update) {
+            last_time = prev_cgroups[i]->last_update;
+            res_idx = i;
+        }
+    }
+
+    return prev_cgroups[res_idx];
+}
+
+static int read_cgroup_throttle(cgroup_cpu_stat_t* cgroups, int* cgroup_num,
+                                cgroup_cpu_stat_t** prev_cgroups) {
 #define CGROUP_PATH "/sys/fs/cgroup/cpu"
     int err = 0;
     struct dirent* dir = 0;
@@ -383,6 +425,7 @@ static int read_cgroup_throttle(cgroup_cpu_stat_t* cgroups, int* cgroup_num) {
         fprintf(stderr, "Failed open %s\n", CGROUP_PATH);
         goto cleanup;
     }
+    time_t ts = time(0);
 
     while ((dir = readdir(root_dir)) != NULL) {
         char name[128];
@@ -405,24 +448,48 @@ static int read_cgroup_throttle(cgroup_cpu_stat_t* cgroups, int* cgroup_num) {
             continue;
         }
 
-        cgroup_cpu_stat_t* stat = cgroups + c_num;
-        memset(stat->cgroup_name, 0, sizeof(stat->cgroup_name));
+        /* idx == -1 means there is no history record for this cgroup */
+        int idx = check_cgroup(prev_cgroups, dir->d_name);
+
+        /* find a slot that stores this cgroup's history info */
+        cgroup_cpu_stat_t* slot = get_cgroup(prev_cgroups, idx);
+        if (!slot) {
+            fprintf(stderr, "Get a null cgroup slot.\n");
+            err = 1;
+            exit(err);
+        }
+
+        cgroup_cpu_stat_t* rec = cgroups + c_num;
+
+        memset(slot->cgroup_name, 0, sizeof(slot->cgroup_name));
+        memset(rec->cgroup_name, 0, sizeof(rec->cgroup_name));
+
+        strncpy(slot->cgroup_name, dir->d_name, sizeof(slot->cgroup_name) - 1);
+        strncpy(rec->cgroup_name, dir->d_name, sizeof(rec->cgroup_name) - 1);
 
-        strncpy(stat->cgroup_name, dir->d_name, sizeof(stat->cgroup_name) - 1);
         while (fscanf(fp, "%s %llu", name, &val) != EOF) {
             if (!strcmp(name, "nr_periods")) {
-                stat->nr_periods = val;
+                if (idx != -1) rec->nr_periods = val - slot->nr_periods;
+                slot->nr_periods = val;
             } else if (!strcmp(name, "nr_throttled")) {
-                stat->nr_throttled = val;
+                if (idx != -1) rec->nr_throttled = val - slot->nr_throttled;
+                slot->nr_throttled = val;
             } else if (!strcmp(name, "throttled_time")) {
-                stat->throttled_time = val;
+                if (idx != -1) rec->throttled_time = val - slot->throttled_time;
+                slot->throttled_time = val;
            } else if (!strcmp(name, "nr_burst")) {
-                stat->nr_burst = val;
+                if (idx != -1) rec->nr_burst = val - slot->nr_burst;
+                slot->nr_burst = val;
            } else if (!strcmp(name, "burst_time")) {
-                stat->burst_time = val;
+                if (idx != -1) rec->burst_time = val - slot->burst_time;
+                slot->burst_time = val;
            }
        }
-        if (stat->nr_throttled > 0) c_num++;
+
+        if (rec->nr_throttled > 0 && idx != -1) {
+            c_num++;
+            slot->last_update = ts;
+        }
 
         fclose(fp);
     }
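The new check_cgroup/get_cgroup/read_cgroup_throttle trio turns the cumulative counters in each cgroup's cpu.stat into per-interval deltas: `slot` holds the previous reading, `rec` receives `current - previous`, and a cgroup is only reported once a history entry exists (hence the `idx != -1` guards). A minimal standalone sketch of that delta technique; the names `cpu_stat` and `delta_stat` are illustrative, not tasktop's:

```c
#include <stdio.h>

/* Illustrative shapes: one cumulative cpu.stat reading and its delta. */
typedef struct {
    unsigned long long nr_periods;
    unsigned long long nr_throttled;
    unsigned long long throttled_time;
} cpu_stat;

/* Return the per-interval delta and remember the reading in *prev.
 * 'first' mirrors tasktop's idx == -1 case: no history, no delta. */
static cpu_stat delta_stat(cpu_stat *prev, cpu_stat now, int first) {
    cpu_stat d = {0};
    if (!first) {
        d.nr_periods = now.nr_periods - prev->nr_periods;
        d.nr_throttled = now.nr_throttled - prev->nr_throttled;
        d.throttled_time = now.throttled_time - prev->throttled_time;
    }
    *prev = now; /* keep the cumulative reading for the next interval */
    return d;
}

int main(void) {
    cpu_stat prev = {0};
    cpu_stat r1 = {2500, 2492, 917387768735ULL};
    cpu_stat r2 = {2531, 2523, 928824820358ULL};
    delta_stat(&prev, r1, 1);              /* first sample only seeds history */
    cpu_stat d = delta_stat(&prev, r2, 0); /* -> 31 periods, 31 throttled */
    printf("nr_periods=%llu nr_throttled=%llu throttled_time=%llu\n",
           d.nr_periods, d.nr_throttled, d.throttled_time);
    return 0;
}
```

This is why the [CGROUP] lines in test.md below change from ever-growing totals such as `stress_cg 2531 2523 ...` to small per-interval values such as `stress_cg 31 31 ...`.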
(!strcmp(name, "burst_time")) { - stat->burst_time = val; + if (idx != -1) rec->burst_time = val - slot->burst_time; + slot->burst_time = val; } } - if (stat->nr_throttled > 0) c_num++; + + if (rec->nr_throttled > 0 && idx != -1) { + c_num++; + slot->last_update = ts; + } fclose(fp); } @@ -471,12 +538,21 @@ static int read_stat(struct sys_cputime_t** prev_sys, * because there is an error between process waked up and running, when * sched delay occur , the sum of cpu rates more than 100%. */ - sys_rec->cpu[i].usr = - (double)(now_sys[i]->usr - prev_sys[i]->usr) * 100 / all_time; - sys_rec->cpu[i].sys = - (double)(now_sys[i]->sys - prev_sys[i]->sys) * 100 / all_time; - sys_rec->cpu[i].iowait = - (double)(now_sys[i]->iowait - prev_sys[i]->iowait) * 100 / all_time; +#define CALC_SHARE(TIME_TYPE) \ + sys_rec->cpu[i].TIME_TYPE = \ + (double)(now_sys[i]->TIME_TYPE - prev_sys[i]->TIME_TYPE) * 100 / \ + all_time; + + CALC_SHARE(usr) + CALC_SHARE(nice) + CALC_SHARE(sys) + CALC_SHARE(idle) + CALC_SHARE(iowait) + CALC_SHARE(irq) + CALC_SHARE(softirq) + CALC_SHARE(steal) + CALC_SHARE(guest) + CALC_SHARE(guest_nice) } cleanup: if (fp) fclose(fp); @@ -766,15 +842,20 @@ static void output_per_cpu(struct record_t* rec, FILE* dest) { int i; struct sys_record_t* sys = &rec->sys; + // 18.8 us, 14.1 sy, 0.0 ni, 65.6 id, 0.0 wa, 0.0 hi, 1.6 si, 0.0 st fprintf(dest, "[PER-CPU]\n"); - fprintf(dest, "%7s %6s %6s %6s %10s\n", "cpu", "usr", "sys", "iowait", - "delay(ns)"); + fprintf(dest, "%7s %6s %6s %6s %6s %6s %6s %6s %6s %10s\n", "cpu", "usr", + "sys", "nice", "idle", "iowait", "h-irq", "s-irq", "steal", + "delay(ms)"); for (i = 1; i <= nr_cpu; i++) { char cpu_name[10]; snprintf(cpu_name, 10, "cpu-%d", i - 1); - fprintf(dest, "%7s %6.1f %6.1f %6.1f %10llu\n", cpu_name, - sys->cpu[i].usr, sys->cpu[i].sys, sys->cpu[i].iowait, - sys->percpu_sched_delay[i - 1]); + fprintf(dest, + "%7s %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %10llu\n", + cpu_name, sys->cpu[i].usr, sys->cpu[i].sys, sys->cpu[i].nice, + sys->cpu[i].idle, sys->cpu[i].iowait, sys->cpu[i].irq, + sys->cpu[i].softirq, sys->cpu[i].steal, + sys->percpu_sched_delay[i - 1] / (1000 * 1000)); } } @@ -842,6 +923,7 @@ static void output_tasktop(struct record_t* rec, int rec_num, FILE* dest) { static void output_d_stack(struct record_t* rec, int d_num, FILE* dest) { int i; struct D_task_record_t* d_tasks = rec->d_tasks; + char* str = calloc(STACK_CONTENT_LEN, sizeof(char)); for (i = 0; i < d_num; i++) { if (i == 0) { fprintf(dest, "[D-STASK]\n"); @@ -851,7 +933,8 @@ static void output_d_stack(struct record_t* rec, int d_num, FILE* dest) { fprintf(dest, "%18s %6d %6d ", d_tasks[i].comm, d_tasks[i].tid, d_tasks[i].pid); - char* str = d_tasks[i].stack; + strncpy(str, d_tasks[i].stack, STACK_CONTENT_LEN); + const char delim[2] = "\n"; char* token; @@ -864,6 +947,150 @@ static void output_d_stack(struct record_t* rec, int d_num, FILE* dest) { fprintf(dest, "%18s %6s %6s %s\n", "", "", "", token); } } + + free(str); +} + +static bool inline is_high_load1(struct record_t* rec) { +#define THRESHOLD_LOAD1 nr_cpu * 1.5 + return rec->sys.load1 >= THRESHOLD_LOAD1; +} + +static bool inline is_high_R(struct record_t* rec) { +#define THRESHOLD_R nr_cpu + return rec->sys.nr_R >= THRESHOLD_R; +} + +static bool inline is_high_D(struct record_t* rec) { +#define THRESHOLD_D 8 + return rec->sys.nr_D >= THRESHOLD_D; +} + +// static bool fork_detect(struct record_t* rec, FILE* dest) { +// #define THRESHOLD_FORK_PS 2000 +// return rec->sys.nr_fork >= THRESHOLD_FORK_PS; +// } + 
@@ -766,15 +842,20 @@ static void output_per_cpu(struct record_t* rec, FILE* dest) {
     int i;
     struct sys_record_t* sys = &rec->sys;
 
+    // 18.8 us, 14.1 sy, 0.0 ni, 65.6 id, 0.0 wa, 0.0 hi, 1.6 si, 0.0 st
     fprintf(dest, "[PER-CPU]\n");
-    fprintf(dest, "%7s %6s %6s %6s %10s\n", "cpu", "usr", "sys", "iowait",
-            "delay(ns)");
+    fprintf(dest, "%7s %6s %6s %6s %6s %6s %6s %6s %6s %10s\n", "cpu", "usr",
+            "sys", "nice", "idle", "iowait", "h-irq", "s-irq", "steal",
+            "delay(ms)");
     for (i = 1; i <= nr_cpu; i++) {
         char cpu_name[10];
         snprintf(cpu_name, 10, "cpu-%d", i - 1);
-        fprintf(dest, "%7s %6.1f %6.1f %6.1f %10llu\n", cpu_name,
-                sys->cpu[i].usr, sys->cpu[i].sys, sys->cpu[i].iowait,
-                sys->percpu_sched_delay[i - 1]);
+        fprintf(dest,
+                "%7s %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %6.1f %10llu\n",
+                cpu_name, sys->cpu[i].usr, sys->cpu[i].sys, sys->cpu[i].nice,
+                sys->cpu[i].idle, sys->cpu[i].iowait, sys->cpu[i].irq,
+                sys->cpu[i].softirq, sys->cpu[i].steal,
+                sys->percpu_sched_delay[i - 1] / (1000 * 1000));
     }
 }
 
@@ -842,6 +923,7 @@ static void output_tasktop(struct record_t* rec, int rec_num, FILE* dest) {
 static void output_d_stack(struct record_t* rec, int d_num, FILE* dest) {
     int i;
     struct D_task_record_t* d_tasks = rec->d_tasks;
+    char* str = calloc(STACK_CONTENT_LEN, sizeof(char));
     for (i = 0; i < d_num; i++) {
         if (i == 0) {
             fprintf(dest, "[D-STASK]\n");
@@ -851,7 +933,8 @@ static void output_d_stack(struct record_t* rec, int d_num, FILE* dest) {
         fprintf(dest, "%18s %6d %6d ", d_tasks[i].comm, d_tasks[i].tid,
                 d_tasks[i].pid);
 
-        char* str = d_tasks[i].stack;
+        strncpy(str, d_tasks[i].stack, STACK_CONTENT_LEN);
+
         const char delim[2] = "\n";
         char* token;
 
@@ -864,6 +947,150 @@ static void output_d_stack(struct record_t* rec, int d_num, FILE* dest) {
             fprintf(dest, "%18s %6s %6s %s\n", "", "", "", token);
         }
     }
+
+    free(str);
+}
+
+static bool inline is_high_load1(struct record_t* rec) {
+#define THRESHOLD_LOAD1 (nr_cpu * 1.5)
+    return rec->sys.load1 >= THRESHOLD_LOAD1;
+}
+
+static bool inline is_high_R(struct record_t* rec) {
+#define THRESHOLD_R nr_cpu
+    return rec->sys.nr_R >= THRESHOLD_R;
+}
+
+static bool inline is_high_D(struct record_t* rec) {
+#define THRESHOLD_D 8
+    return rec->sys.nr_D >= THRESHOLD_D;
+}
+
+// static bool fork_detect(struct record_t* rec, FILE* dest) {
+// #define THRESHOLD_FORK_PS 2000
+//     return rec->sys.nr_fork >= THRESHOLD_FORK_PS;
+// }
+
+double inline calculate_sys(cpu_util_t* cpu) {
+    double sys_util = cpu->iowait + cpu->sys + cpu->softirq + cpu->irq;
+    return sys_util;
+}
+
+double inline calculate_overall(cpu_util_t* cpu) {
+    double overall_cpuutil = cpu->iowait + cpu->sys + cpu->softirq +
+                             cpu->irq + cpu->nice + cpu->usr;
+    return overall_cpuutil;
+}
+
+static void inline is_high_overall_cpuutil(struct record_t* rec, FILE* dest) {
+#define THRESHOLD_CPU_OVERLOAD 55
+    cpu_util_t* cpu = rec->sys.cpu;
+    double overall_cpuutil = calculate_overall(cpu);
+    if (overall_cpuutil >= THRESHOLD_CPU_OVERLOAD) {
+        fprintf(dest, "WARN: CPU overall utilization is high.\n");
+    }
+}
+
+static void inline is_high_sys(struct record_t* rec, FILE* dest) {
+#define THRESHOLD_SYS 15
+    double sys_util = calculate_sys(rec->sys.cpu);
+    if (sys_util >= THRESHOLD_SYS) {
+        fprintf(dest, "INFO: Sys time of cpu is high.\n");
+    }
+}
+
+static void is_bind(struct record_t* rec, FILE* dest) {
+#define THRESHOLD_BIND 25
+    int i;
+    cpu_util_t* cpu = rec->sys.cpu;
+    double min_overall = calculate_overall(cpu);
+
+    for (i = 1; i <= nr_cpu; i++) {
+        double overall = calculate_overall(cpu + i);
+        if (overall < min_overall) min_overall = overall;
+    }
+
+    bool status[nr_cpu];
+    bool exist = false;
+    for (i = 1; i <= nr_cpu; i++) {
+        double overall = calculate_overall(cpu + i);
+        if (overall - min_overall > THRESHOLD_BIND) {
+            status[i - 1] = true;
+            exist = true;
+        } else {
+            status[i - 1] = false;
+        }
+    }
+
+    if (exist) {
+        fprintf(dest, "WARN: Some tasks bind cpu. Please check cpu: ");
+        for (i = 1; i <= nr_cpu; i++) {
+            if (status[i - 1]) {
+                fprintf(dest, " [%d]", i - 1);
+            }
+        }
+        fprintf(dest, "\n");
+    }
+}
+
+static void group_by_stack(struct record_t* rec, FILE* dest, int d_num) {
+#define PREFIX_LEN 64
+    D_task_record_t* dtask = rec->d_tasks;
+    int* counter = calloc(d_num, sizeof(int));
+    D_task_record_t** stack = calloc(d_num, sizeof(D_task_record_t*));
+
+    int empty_slot = 0;
+    int i, j;
+    for (i = 0; i < d_num; i++) {
+        D_task_record_t* s = dtask + i;
+        bool match = false;
+        for (j = 0; j < empty_slot; j++) {
+            if (stack[j]) {
+                if (!strncmp(stack[j]->stack, s->stack, PREFIX_LEN)) {
+                    counter[j]++;
+                    match = true;
+                    break;
+                }
+            }
+        }
+
+        if (!match) {
+            stack[empty_slot++] = s;
+        }
+    }
+
+    int max_idx = 0;
+    int max_times = 0;
+    for (i = 0; i < empty_slot; i++) {
+        if (counter[i] > max_times) {
+            max_times = counter[i];
+            max_idx = i;
+        }
+    }
+
+    fprintf(dest, "WARN: The most stack, times=%d\n", max_times);
+    fprintf(dest, "%s", stack[max_idx]->stack);
+    free(stack);
+    free(counter);
+}
+
+static void load_analyse(struct record_t* rec, int rec_num, FILE* dest,
+                         int d_num, int cgroup_num) {
+    if (is_high_load1(rec)) {
+        if (is_high_R(rec)) {
+            is_high_overall_cpuutil(rec, dest);
+
+            is_high_sys(rec, dest);
+
+            is_bind(rec, dest);
+        }
+
+        if (is_high_D(rec)) {
+            group_by_stack(rec, dest, d_num);
+        }
+    } else {
+        fprintf(dest, "INFO: Load is normal.\n");
+    }
 }
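A note on the `strncpy` into the new scratch buffer in output_d_stack above: `strtok` writes a `'\0'` over every delimiter it consumes, so tokenizing `d_tasks[i].stack` in place would destroy the stored stack text, which `group_by_stack` still needs afterwards. A tiny standalone illustration of that strtok behavior:

```c
#include <stdio.h>
#include <string.h>

int main(void) {
    char stack[] = "frame_a\nframe_b\nframe_c";
    char copy[sizeof(stack)];
    memcpy(copy, stack, sizeof(stack)); /* tokenize the copy, not the original */

    for (char *tok = strtok(copy, "\n"); tok; tok = strtok(NULL, "\n"))
        printf("token: %s\n", tok);

    /* 'stack' is untouched; 'copy' now has '\0' where '\n' used to be */
    printf("original intact, %zu chars\n", strlen(stack));
    return 0;
}
```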
@@ -875,6 +1102,7 @@ static void output(struct record_t* rec, int rec_num, FILE* dest, int d_num,
 
     output_tasktop(rec, rec_num, dest);
     output_d_stack(rec, d_num, dest);
+    load_analyse(rec, rec_num, dest, d_num, cgroup_num);
 
     fflush(dest);
 }
@@ -928,7 +1156,8 @@ static int make_records(struct id_pair_t* pids, int nr_thread,
             }
         }
 
-        /* many pair with the same pid, in process mode skip the trival read */
+        /* many pairs share the same pid; in process mode skip the trivial
+         * read */
         if (!env.thread_mode && id->pid != id->tid) continue;
 
         if (env.thread_mode) {
@@ -1023,6 +1252,7 @@ int main(int argc, char** argv) {
     struct tasktop_bpf* skel = 0;
     struct id_pair_t* pids = 0;
     struct task_cputime_t **prev_task = 0, **now_task = 0;
+    struct cgroup_cpu_stat_t** prev_cgroup = 0;
     struct sys_cputime_t **prev_sys = 0, **now_sys = 0;
     struct record_t* rec = 0;
     u_int64_t i;
@@ -1076,6 +1306,8 @@ int main(int argc, char** argv) {
         goto cleanup;
     }
 
+    prev_cgroup = calloc(env.cgroup_limit, sizeof(struct cgroup_cpu_stat_t*));
+
     prev_delay = calloc(nr_cpu, sizeof(int));
     pids = calloc(pidmax + 1, sizeof(struct id_pair_t));
     prev_task = calloc(pidmax + 1, sizeof(struct task_cputime_t*));
@@ -1131,7 +1363,7 @@ int main(int argc, char** argv) {
         int cgroup_num = 0;
 #ifndef ONLY_THREAD
-        read_cgroup_throttle(rec->cgroups, &cgroup_num);
+        read_cgroup_throttle(rec->cgroups, &cgroup_num, prev_cgroup);
         read_sched_delay(&rec->sys);
         check_fork(fork_map_fd, &rec->sys);
         runnable_proc(&rec->sys);
diff --git a/source/tools/detect/sched/tasktop/tasktop.h b/source/tools/detect/sched/tasktop/tasktop.h
index a90d9c1ad3e55c2b87f3c8bb7892456a72577658..f9493c00d48de954c52519d3fb4f9f24258b726f 100644
--- a/source/tools/detect/sched/tasktop/tasktop.h
+++ b/source/tools/detect/sched/tasktop/tasktop.h
@@ -96,12 +96,20 @@ typedef struct cgroup_cpu_stat_t {
     unsigned long long current_bw;
     int nr_burst;
     unsigned long long burst_time;
+    time_t last_update;
 } cgroup_cpu_stat_t;
 
 typedef struct cpu_util_t {
     double usr;
+    double nice;
     double sys;
+    double idle;
     double iowait;
+    double irq;
+    double softirq;
+    double steal;
+    double guest;
+    double guest_nice;
 } cpu_util_t;
 
 typedef struct sys_record_t {
@@ -125,4 +133,4 @@ struct record_t {
     sys_record_t sys;
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/tools/detect/sched/tasktop/tasktopSelftest/test.c b/source/tools/detect/sched/tasktop/tasktopSelftest/test.c
index 9be766ef50fabd6bada12c1607b87b337c6b08fe..c6bcc03e93a47658c4efed9eb7d4f54a2badc9d1 100644
--- a/source/tools/detect/sched/tasktop/tasktopSelftest/test.c
+++ b/source/tools/detect/sched/tasktop/tasktopSelftest/test.c
@@ -31,14 +31,15 @@ void *run_forever(void *arg) {
 void *run_sleep(void *arg) { sleep(10000); }
 
 void run_multithread() {
-    pthread_t pid[128];
+#define NUM 128
+    pthread_t pid[NUM];
     int i;
-    for (i = 0; i < 128; i++) {
+    for (i = 0; i < NUM; i++) {
         pthread_create(&pid[i], 0, run_forever, 0);
         // printf("fork.\n");
     }
 
-    for (i = 0; i < 128; i++) {
+    for (i = 0; i < NUM; i++) {
         pthread_join(pid[i], 0);
     }
 }
diff --git a/source/tools/detect/sched/tasktop/tasktopSelftest/test.md b/source/tools/detect/sched/tasktop/tasktopSelftest/test.md
index 07d748566f7b8804845bd6c960fb3cf70e60e31a..6c18dd7472f8039f1d9dbaf8f4a38cfb04d41ad5 100644
--- a/source/tools/detect/sched/tasktop/tasktopSelftest/test.md
+++ b/source/tools/detect/sched/tasktop/tasktopSelftest/test.md
@@ -47,23 +47,52 @@
 #### 2.1.2 Test results
 
-    [TIME-STAMP] 2023-05-30 01:37:05
+    [TIME-STAMP] 2023-06-06 02:59:25
     [UTIL&LOAD]
        usr    sys iowait  load1      R      D   fork  : proc
-      96.4    3.5    0.0   63.2     71      1    124  : logagentctl.sh(33155) ppid=33152 cnt=6
-    [PER_CPU]
-        cpu    usr    sys iowait    delay(ns)
-      cpu-0   95.6    4.4    0.0  71161526462
-      cpu-1   95.3    4.4    0.0  63930165092
-      cpu-2   97.8    2.2    0.0  59050351997
-      cpu-3   96.7    3.0    0.0  61105725553
+      94.5    5.5    0.0   56.7     72      1    203  : cpuUsage.sh(56709) ppid=71957 cnt=15
+    [PER-CPU]
+        cpu    usr    sys   nice   idle iowait  h-irq  s-irq  steal  delay(ms)
+      cpu-0   92.6    7.4    0.0    0.0    0.0    0.0    0.0    0.0      74125
+      cpu-1   93.4    6.4    0.0    0.0    0.0    0.0    0.3    0.0      72258
+      cpu-2   97.2    2.8    0.0    0.0    0.0    0.0    0.0    0.0      68874
+      cpu-3   94.7    5.3    0.0    0.0    0.0    0.0    0.0    0.0      68119
     [TASKTOP]
               COMMAND    PID   PPID      START    RUN %UTIME %STIME   %CPU
-             (stress)  32171  32170 1685410602     23    7.0    0.0    7.0
-             (stress)  32172  32170 1685410602     23    7.0    0.0    7.0
-             (stress)  32173  32170 1685410602     23    7.0    0.0    7.0
-             (stress)  32174  32170 1685410602     23    7.0    0.0    7.0
-             (stress)  32175  32170 1685410602     23    7.0    0.0    7.0
+           (cpptools)  72168  72010 1686014764   5601    9.7    0.3   10.0
+          (AliYunDun)  26698      1 1685936915  83450    5.3    2.7    8.0
+             (stress)  54005  53998 1686020277     88    7.3    0.0    7.3
+             (stress)  54011  53998 1686020277     88    7.3    0.0    7.3
+    ................
+             (stress)  54062  53998 1686020277     88    6.7    0.0    6.7
+             (stress)  54037  53998 1686020277     88    6.7    0.0    6.7
+             (stress)  54051  53998 1686020277     88    6.3    0.0    6.3
+           (telegraf)  38654      1 1684980656 1039709    4.0    2.0    6.0
+               (node)  72010  71892 1686014761   5604    2.3    2.0    4.3
+           (uniagent)  39728      1 1684980670 1039695    1.3    0.3    1.7
+            (tasktop)  56641  56640 1686020359      6    0.3    1.3    1.7
+         (argusagent)  39850      1 1684980673 1039692    0.7    0.7    1.3
+               (node)  71957  71892 1686014761   5604    0.3    0.7    1.0
+               (node)  71892  71871 1686014761   5604    0.7    0.0    0.7
+            (systemd)      1      0 1684918982 1101383    0.3    0.3    0.7
+         (staragentd)  40538      1 1684980691 1039674    0.0    0.3    0.3
+          (rcu_sched)     10      2 1684918982 1101383    0.0    0.3    0.3
+           (ilogtail)  38538   1620 1684980652 1039713    0.3    0.0    0.3
+          (syslog-ng)  38653      1 1684980656 1039709    0.3    0.0    0.3
+        (dbus-daemon)   1125      1 1684918989 1101376    0.3    0.0    0.3
+    (systemd-journalS  38655      1 1684980656 1039709    0.3    0.0    0.3
+               (sshd)  71789  71756 1686014760   5605    0.0    0.3    0.3
+           (logagent)  39295      1 1684980663 1039702    0.3    0.0    0.3
+            (dockerd)   1426      1 1684918991 1101374    0.3    0.0    0.3
+    (AliYunDunUpdateS   1793      1 1684918991 1101374    0.3    0.0    0.3
+              (walle)  40189      1 1684980677 1039688    0.3    0.0    0.3
+    [D-STASK]
+              COMMAND    PID   PPID STACK
+          (load_calc)    141    141 [<0>] load_calc_func+0x57/0x130
+                                    [<0>] kthread+0xf5/0x130
+                                    [<0>] ret_from_fork+0x1f/0x30
+                                    [<0>] 0xffffffffffffffff
+    WARN: CPU overall utilization is high.
 
 load1 is observed to spike rapidly, the system-wide and per-CPU utilization are saturated, CPU time is concentrated in user mode, and the per-CPU scheduling delay reaches 60s.
@@ -81,26 +110,57 @@
 #### 2.2.2 Test results
 
-    [TIME-STAMP] 2023-05-30 01:35:43
+    [TIME-STAMP] 2023-06-06 03:00:54
     [UTIL&LOAD]
        usr    sys iowait  load1      R      D   fork  : proc
-      32.9    4.0    0.0   65.2     64      1    114  : cpuUsage.sh(30412) ppid=1755 cnt=15
-    [PER_CPU]
-        cpu    usr    sys iowait    delay(ns)
-      cpu-0  100.0    0.0    0.0 191098577265
-      cpu-1   10.0    4.3    0.0    217842891
-      cpu-2   11.0    6.6    0.0    152909467
-      cpu-3   10.0    5.0    0.0    212943380
+      28.5    3.1    0.0   67.7     64      2     80  : logagentctl.sh(59611) ppid=59608 cnt=6
+    [PER-CPU]
+        cpu    usr    sys   nice   idle iowait  h-irq  s-irq  steal  delay(ms)
+      cpu-0  100.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0     191516
+      cpu-1    3.3    2.6    0.0   91.7    0.0    0.0    2.3    0.0         91
+      cpu-2    4.3    4.0    0.0   91.6    0.0    0.0    0.0    0.0        154
+      cpu-3    5.0    6.0    0.0   89.0    0.0    0.0    0.0    0.0        129
     [TASKTOP]
-              COMMAND    PID   PPID      START    RUN %UTIME %STIME   %CPU
-               (node)   1816   1710 1685409611    932   16.3    1.3   17.7
-          (AliYunDun)  27299      1 1685004504 406039    3.0    2.0    5.0
-               (node)   1710   1697 1685409611    932    2.3    0.7    3.0
-           (telegraf)  38654      1 1684980656 429887    1.7    1.3    3.0
-               (test)  20079  20076 1685410217    326    1.7    0.0    1.7
-               (test)  20080  20076 1685410217    326    1.7    0.0    1.7
-               (test)  20081  20076 1685410217    326    1.7    0.0    1.7
-               (test)  20082  20076 1685410217    326    1.7    0.0    1.7
+              COMMAND    PID   PPID      START    RUN %UTIME %STIME   %CPU
+          (AliYunDun)  26698      1 1685936915  83539    2.3    1.7    4.0
+           (telegraf)  38654      1 1684980656 1039798    1.3    2.0    3.3
+               (node)  72010  71892 1686014761   5693    2.0    0.7    2.7
+               (test)  58683  58682 1686020436     18    1.7    0.0    1.7
+               (test)  58684  58682 1686020436     18    1.7    0.0    1.7
+    ...........
+               (test)  58744  58682 1686020436     18    1.7    0.0    1.7
+               (test)  58745  58682 1686020436     18    1.7    0.0    1.7
+               (test)  58746  58682 1686020436     18    1.7    0.0    1.7
+            (tasktop)  56641  56640 1686020359     95    0.3    1.3    1.7
+               (test)  58725  58682 1686020436     18    1.3    0.0    1.3
+               (test)  58702  58682 1686020436     18    1.3    0.0    1.3
+           (uniagent)  39728      1 1684980670 1039784    1.0    0.3    1.3
+               (test)  58696  58682 1686020436     18    1.3    0.0    1.3
+               (test)  58697  58682 1686020436     18    1.3    0.0    1.3
+    .............
+               (test)  58722  58682 1686020436     18    1.3    0.0    1.3
+               (test)  58723  58682 1686020436     18    1.3    0.0    1.3
+               (test)  58685  58682 1686020436     18    1.3    0.0    1.3
+            (dockerd)   1426      1 1684918991 1101463    1.0    0.0    1.0
+         (staragentd)  40538      1 1684980691 1039763    0.3    0.7    1.0
+               (sshd)  71789  71756 1686014760   5694    0.3    0.3    0.7
+         (argusagent)  39850      1 1684980673 1039781    0.3    0.3    0.7
+    (systemd-journalS  38655      1 1684980656 1039798    0.3    0.0    0.3
+    (systemd-logind)S   1159      1 1684918989 1101465    0.0    0.3    0.3
+           (ilogtail)  38538   1620 1684980652 1039802    0.0    0.3    0.3
+              (walle)  40189      1 1684980677 1039777    0.3    0.0    0.3
+          (syslog-ng)  38653      1 1684980656 1039798    0.0    0.3    0.3
+               (node)  71892  71871 1686014761   5693    0.3    0.0    0.3
+               (node)  71957  71892 1686014761   5693    0.0    0.3    0.3
+        (dbus-daemon)   1125      1 1684918989 1101465    0.3    0.0    0.3
+    [D-STASK]
+              COMMAND    PID   PPID STACK
+          (load_calc)    141    141 [<0>] load_calc_func+0x57/0x130
+                                    [<0>] kthread+0xf5/0x130
+                                    [<0>] ret_from_fork+0x1f/0x30
+                                    [<0>] 0xffffffffffffffff
+    WARN: Some tasks bind cpu. Please check cpu:  [0]
 
 load1 spikes and the number of R-state processes grows, yet overall CPU utilization stays low; cpu-0 alone is saturated and its scheduling delay reaches 190s.
 
 ### 2.3 Heavy-fork scenario
@@ -117,50 +177,49 @@
 #### 2.3.2 Test results
 
-    [TIME-STAMP] 2023-05-30 01:38:51
+    [TIME-STAMP] 2023-06-06 03:02:38
     [UTIL&LOAD]
-       usr    sys iowait  load1      R      D   fork  : proc
-      60.5   33.6    0.0   49.6     78      1  16304  : test(34338) ppid=1893 cnt=16230
-    [PER_CPU]
-        cpu    usr    sys iowait    delay(ns)
-      cpu-0   58.2   35.3    0.0  24573513865
-      cpu-1   60.5   34.0    0.0  24758192280
-      cpu-2   60.9   33.2    0.0  24491136783
-      cpu-3   62.3   31.8    0.0  24525447957
+       usr    sys iowait  load1      R      D   fork  : proc
+      57.7   36.0    0.0   57.0     72      1  14614  : test(62171) ppid=49997 cnt=14548
+    [PER-CPU]
+        cpu    usr    sys   nice   idle iowait  h-irq  s-irq  steal  delay(ms)
+      cpu-0   58.9   33.7    0.0    6.8    0.0    0.0    0.3    0.3      24100
+      cpu-1   61.4   33.1    0.0    5.1    0.0    0.0    0.3    0.0      24844
+      cpu-2   56.0   37.5    0.0    6.5    0.0    0.0    0.0    0.0      24961
+      cpu-3   54.5   39.3    0.0    6.2    0.0    0.0    0.0    0.0      25138
     [TASKTOP]
-              COMMAND    PID   PPID      START    RUN %UTIME %STIME   %CPU
-               (test)  34338   1893 1685410675     56    1.3   41.0   42.3
-           (telegraf)  38654      1 1684980656 430075    1.7    1.7    3.3
-               (node)   1816   1710 1685409611   1120    1.7    1.0    2.7
-            (tasktop)  29995  29987 1685410527    204    0.0    1.3    1.3
-           (uniagent)  39728      1 1684980670 430061    1.0    0.3    1.3
- (kworker/3:3-eveIts)  50231      2 1685410706     25    0.0    1.3    1.3
- (kworker/2:0-eveIts)  27079      2 1685410443    288    0.0    1.0    1.0
-            (dockerd)   1426      1 1684918991 491740    1.0    0.0    1.0
- (kworker/1:2-eveIts)  10237      2 1685409893    838    0.0    1.0    1.0
-               (node)   1710   1697 1685409611   1120    0.7    0.0    0.7
- (kworker/0:2-eveIts)  32158      2 1685410601    130    0.0    0.7    0.7
-         (argusagent)  39850      1 1684980673 430058    0.3    0.3    0.7
-         (staragentd)  40538      1 1684980691 430040    0.0    0.7    0.7
-            (systemd)      1      0 1684918982 491749    0.3    0.3    0.7
-           (ilogtail)  38538   1620 1684980652 430079    0.0    0.3    0.3
-    (systemd-logind)S   1159      1 1684918989 491742    0.0    0.3    0.3
-    (systemd-journalS  38655      1 1684980656 430075    0.0    0.3    0.3
-           (logagent)  39295      1 1684980663 430068    0.0    0.3    0.3
-          (rcu_sched)     10      2 1684918982 491749    0.0    0.3    0.3
-       (ksoftirqd/2)      21      2 1684918982 491749    0.0    0.3    0.3
-              (walle)  40189      1 1684980677 430054    0.3    0.0    0.3
-               (java)  40355      1 1684980680 430051    0.3    0.0    0.3
-               (node)   1755   1710 1685409611   1120    0.0    0.3    0.3
-             (docker)  43348  43343 1684980759 429972    0.0    0.3    0.3
-        (dbus-daemon)   1125      1 1684918989 491742    0.3    0.0    0.3
+              COMMAND    PID   PPID      START    RUN %UTIME %STIME   %CPU
+          (AliYunDun)  26698      1 1685936915  83643   39.0   15.3   54.3
+               (test)  62171  49997 1686020542     16    1.7   40.0   41.7
+           (telegraf)  38654      1 1684980656 1039902    2.0    1.7    3.7
+               (node)  72010  71892 1686014761   5797    2.0    1.3    3.3
+           (uniagent)  39728      1 1684980670 1039888    2.0    0.0    2.0
+            (tasktop)  56641  56640 1686020359    199    0.3    1.3    1.7
+         (argusagent)  39850      1 1684980673 1039885    0.7    0.7    1.3
+ (kworker/3:3-eveIts)  56014      2 1686020335    223    0.0    1.0    1.0
+ (kworker/0:0-eveIts)  36226      2 1686019771    787    0.0    1.0    1.0
+ (kworker/1:0-eveIts)  34141      2 1686019712    846    0.0    1.0    1.0
+              (walle)  40189      1 1684980677 1039881    1.0    0.0    1.0
+           (ilogtail)  38538   1620 1684980652 1039906    0.3    0.3    0.7
+ (kworker/2:0-eveIts)  99018      2 1686020550      8    0.0    0.7    0.7
+               (java)   2234      1 1685929891  90667    0.3    0.0    0.3
+         (staragentd)  40538      1 1684980691 1039867    0.0    0.3    0.3
+          (rcu_sched)     10      2 1684918982 1101576    0.0    0.3    0.3
+           (logagent)  39295      1 1684980663 1039895    0.3    0.0    0.3
+    (logagent-collecS  39368  39295 1684980663 1039895    0.3    0.0    0.3
+               (sshd)  71789  71756 1686014760   5798    0.0    0.3    0.3
+               (node)  71892  71871 1686014761   5797    0.3    0.0    0.3
+               (node)  71957  71892 1686014761   5797    0.3    0.0    0.3
+       (ksoftirqd/2)      21      2 1684918982 1101576    0.0    0.3    0.3
+    (AliYunDunUpdateS   1793      1 1684918991 1101567    0.0    0.3    0.3
     [D-STASK]
-              COMMAND    PID   PPID STACK
-          (load_calc)    141    141 [<0>] load_calc_func+0x57/0x130
-                                    [<0>] load_calc_func+0x57/0x130
+              COMMAND    PID   PPID STACK
+          (load_calc)    141    141 [<0>] load_calc_func+0x57/0x130
                                     [<0>] kthread+0xf5/0x130
                                     [<0>] ret_from_fork+0x1f/0x30
                                     [<0>] 0xffffffffffffffff
+    WARN: CPU overall utilization is high.
+    INFO: Sys time of cpu is high.
 
 Load rises while CPU utilization is also saturated, and there are many R-state processes that top fails to capture. The fork rate surges; the process issuing the most fork calls is test, whose sys utilization is correspondingly high.
@@ -184,45 +243,46 @@
 #### 2.4.2 Test results
 
-    [TIME-STAMP] 2023-05-30 02:07:05
+    [TIME-STAMP] 2023-06-06 03:05:57
     [UTIL&LOAD]
        usr    sys iowait  load1      R      D   fork  : proc
-      11.9    3.3    0.0    3.0      0      1     97  : logagentctl.sh(51049) ppid=51047 cnt=6
-    [PER_CPU]
-        cpu    usr    sys iowait    delay(ns)
-      cpu-0   11.6    3.0    0.0 320221229182
-      cpu-1   12.3    3.3    0.0   2954563205
-      cpu-2   12.2    3.3    0.0   5919534403
-      cpu-3   11.8    3.9    0.0  30688036677
+      11.4    2.5    0.0    5.8      1      1     93  : hostinfo(59107) ppid=40018 cnt=8
+    [PER-CPU]
+        cpu    usr    sys   nice   idle iowait  h-irq  s-irq  steal  delay(ms)
+      cpu-0   10.9    2.3    0.0   86.8    0.0    0.0    0.0    0.0     115336
+      cpu-1   11.8    3.0    0.0   84.9    0.0    0.0    0.3    0.0     112956
+      cpu-2   11.4    2.6    0.0   85.7    0.0    0.0    0.0    0.3      76902
+      cpu-3   11.8    2.6    0.0   85.3    0.0    0.0    0.0    0.3      76995
     [CGROUP]
     cgroup_name  nr_periods  nr_throttled  throttled_time  nr_burst  burst_time
-    stress_cg          2531          2523    928824820358         0           0
+    stress_cg            31            31     11437051623         0           0
     [TASKTOP]
-              COMMAND    PID   PPID      START    RUN %UTIME %STIME   %CPU
-               (test)  42327   1893 1685412142    283   30.0    0.0   30.0
-          (AliYunDun)   7999      1 1685411022   1403    1.7    1.7    3.3
-           (uniagent)  39728      1 1684980670 431755    2.0    1.0    3.0
-           (telegraf)  38654      1 1684980656 431769    1.7    1.0    2.7
-               (node)   1816   1710 1685409611   2814    1.7    0.7    2.3
-            (tasktop)  23209  23200 1685411589    836    0.3    1.0    1.3
-               (node)   1710   1697 1685409611   2814    0.7    0.3    1.0
-              (walle)  40189      1 1684980677 431748    0.7    0.0    0.7
-    (AliYunDunUpdateS   1793      1 1684918991 493434    0.3    0.0    0.3
-    (systemd-journalS  38655      1 1684980656 431769    0.3    0.0    0.3
-           (logagent)  39295      1 1684980663 431762    0.3    0.0    0.3
-                (top)  27389  27304 1685411726    699    0.3    0.0    0.3
-         (argusagent)  39850      1 1684980673 431752    0.3    0.0    0.3
-           (ilogtail)  38538   1620 1684980652 431773    0.3    0.0    0.3
-         (staragentd)  40538      1 1684980691 431734    0.0    0.3    0.3
-              (dfget)  41456      1 1685412118    307    0.0    0.3    0.3
-    (docker-containeS  38593   1426 1684980653 431772    0.3    0.0    0.3
+              COMMAND    PID   PPID      START    RUN %UTIME %STIME   %CPU
+               (test)  55827  41984 1686020648    109   30.0    0.0   30.0
+           (telegraf)  38654      1 1684980656 1040101    3.3    1.3    4.7
+           (uniagent)  39728      1 1684980670 1040087    2.0    0.3    2.3
+               (node)  72010  71892 1686014761   5996    1.3    1.0    2.3
+            (tasktop)  56641  56640 1686020359    398    0.3    1.3    1.7
+         (staragentd)  40538      1 1684980691 1040066    0.0    0.7    0.7
+        (dbus-daemon)   1125      1 1684918989 1101768    0.3    0.3    0.7
+    (systemd-logind)S   1159      1 1684918989 1101768    0.3    0.3    0.7
+               (node)  71892  71871 1686014761   5996    0.7    0.0    0.7
+              (walle)  40189      1 1684980677 1040080    0.3    0.3    0.7
+            (chronyd)   3259      1 1684919007 1101750    0.0    0.3    0.3
+               (java)  40355      1 1684980680 1040077    0.0    0.3    0.3
+          (rcu_sched)     10      2 1684918982 1101775    0.0    0.3    0.3
+               (java)  41200      1 1684980696 1040061    0.3    0.0    0.3
+    (systemd-journalS  38655      1 1684980656 1040101    0.0    0.3    0.3
+    (logagent-collecS  39368  39295 1684980663 1040094    0.3    0.0    0.3
+               (java)   2234      1 1685929891  90866    0.0    0.3    0.3
+         (argusagent)  39850      1 1684980673 1040084    0.0    0.3    0.3
     [D-STASK]
-              COMMAND    PID   PPID STACK
-          (load_calc)    141    141 [<0>] load_calc_func+0x57/0x130
-                                    [<0>] load_calc_func+0x57/0x130
+              COMMAND    PID   PPID STACK
+          (load_calc)    141    141 [<0>] load_calc_func+0x57/0x130
                                     [<0>] kthread+0xf5/0x130
                                     [<0>] ret_from_fork+0x1f/0x30
                                     [<0>] 0xffffffffffffffff
+    INFO: Load is normal.
 
 Although the **actual load** is very high here and many tasks sit in the R state because of throttling, the cgroup mechanism keeps those tasks off the run queue, so the R-count metric is inaccurate and load1 is computed incorrectly (load1 cannot faithfully reflect the real system load). The cgroup throttling section, however, shows that stress_cg **is being throttled heavily**, and the **per-CPU scheduling delay is very high**, which to some extent reveals a backlog of tasks on the CPU run queues.
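One way to read the per-interval [CGROUP] line above (`stress_cg 31 31 11437051623 0 0`): nr_throttled/nr_periods is the fraction of CFS periods in which stress_cg was throttled during this sampling window, and throttled_time is in nanoseconds. A small worked example using those values:

```c
#include <stdio.h>

int main(void) {
    /* values taken from the [CGROUP] line above */
    unsigned long long nr_periods = 31, nr_throttled = 31;
    unsigned long long throttled_ns = 11437051623ULL;

    printf("throttled in %.0f%% of CFS periods, %.2f s throttled in total\n",
           100.0 * nr_throttled / nr_periods, throttled_ns / 1e9);
    /* -> throttled in 100% of CFS periods, 11.44 s throttled in total */
    return 0;
}
```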
@@ -238,91 +298,92 @@
 #### 2.5.2 Test results
 
-    [TIME-STAMP] 2023-05-30 02:37:38
+    [TIME-STAMP] 2023-06-06 03:06:55
     [UTIL&LOAD]
        usr    sys iowait  load1      R      D   fork  : proc
-       5.2   72.4   17.1   15.5     42     31     76  : walle-plugin-no(34901) ppid=40189 cnt=6
-    [PER_CPU]
-        cpu    usr    sys iowait    delay(ns)
-      cpu-0    5.0   72.2   17.1  16162101481
-      cpu-1    6.0   70.4   17.9  13251662817
-      cpu-2    3.5   76.8   16.5  20181731711
-      cpu-3    5.6   71.2   17.2  16403953008
+      12.0   67.2   16.0   75.7     37     65     96  : walle(40192) ppid=1 cnt=6
+    [PER-CPU]
+        cpu    usr    sys   nice   idle iowait  h-irq  s-irq  steal  delay(ms)
+      cpu-0   13.5   64.3    0.0    4.8   17.4    0.0    0.0    0.0      16684
+      cpu-1   12.0   66.0    0.0    5.5   16.5    0.0    0.0    0.0     250220
+      cpu-2   10.4   72.8    0.0    3.4   13.4    0.0    0.0    0.0     100659
+      cpu-3   12.3   66.0    0.0    5.2   16.5    0.0    0.0    0.0      94436
     [CGROUP]
     cgroup_name  nr_periods  nr_throttled  throttled_time  nr_burst  burst_time
-    stress_cg          7811          7801   2868506676781         0           0
+    stress_cg            31            31      9558428752         0           0
     [TASKTOP]
               COMMAND    PID   PPID      START    RUN %UTIME %STIME   %CPU
-           (telegraf)  38654      1 1684980656 433602    3.0    2.0    5.0
-             (stress)  34147  34109 1685414245     13    0.0    4.7    4.7
-             (stress)  34151  34109 1685414245     13    0.0    4.7    4.7
-             (stress)  34159  34109 1685414245     13    0.0    4.7    4.7
-    ...
-             (stress)  34125  34109 1685414245     13    0.0    3.7    3.7
-             (stress)  34132  34109 1685414245     13    0.0    3.7    3.7
-               (node)   1710   1697 1685409611   4647    2.7    0.3    3.0
-            (tasktop)  23209  23200 1685411589   2669    0.3    1.0    1.3
-        (jbd2/vda2-8)    606      2 1684918988 495270    0.0    1.3    1.3
-           (uniagent)  39728      1 1684980670 433588    1.3    0.0    1.3
-              (walle)  40189      1 1684980677 433581    1.3    0.0    1.3
-         (argusagent)  39850      1 1684980673 433585    0.3    0.7    1.0
-         (staragentd)  40538      1 1684980691 433567    0.3    0.7    1.0
-           (ilogtail)  38538   1620 1684980652 433606    0.3    0.3    0.7
- (kworker/u8:2-flIsh-  26679      2 1685413999    259    0.0    0.3    0.3
- (kworker/u8:3-flIsh-  28798      2 1685414070    188    0.0    0.3    0.3
-    (systemd-logind)S   1159      1 1684918989 495269    0.3    0.0    0.3
-            (dockerd)   1426      1 1684918991 495267    0.3    0.0    0.3
-               (sshd)   1609   1589 1685409610   4648    0.0    0.3    0.3
- (kworker/0:1H-kbIock    592      2 1684918983 495275    0.0    0.3    0.3
-               (node)   1755   1710 1685409611   4647    0.0    0.3    0.3
-    (docker-containeS  38593   1426 1684980653 433605    0.3    0.0    0.3
- (kworker/3:1H-kbIock    599      2 1684918983 495275    0.0    0.3    0.3
-    (systemd-journalS  38655      1 1684980656 433602    0.0    0.3    0.3
-    (logagent-collecS  39369  39295 1684980663 433595    0.3    0.0    0.3
- (kworker/u8:1-flIsh-   6901      2 1685410987   3271    0.0    0.3    0.3
-               (node)  13915   1816 1685410008   4250    0.3    0.0    0.3
-            (systemd)      1      0 1684918982 495276    0.3    0.0    0.3
-        (dbus-daemon)   1125      1 1684918989 495269    0.3    0.0    0.3
- (kworker/u8:4-flIsh-  66011      2 1685412917   1341    0.0    0.3    0.3
+               (test)  55827  41984 1686020648    167   29.3    1.7   31.0
+           (telegraf)  38654      1 1684980656 1040159    3.0    2.3    5.3
+ (kworker/2:1H-kbIock    644      2 1684918988 1101827    0.0    4.7    4.7
+          (AliYunDun)  59678      1 1686020777     38    2.3    1.7    4.0
+             (stress)  60016  60015 1686020788     27    0.0    4.0    4.0
+             (stress)  60017  60015 1686020788     27    0.0    4.0    4.0
+             (stress)  60018  60015 1686020788     27    0.0    4.0    4.0
+    ..................
+             (stress)  60076  60015 1686020788     27    0.0    3.7    3.7
+             (stress)  60033  60015 1686020788     27    0.0    3.7    3.7
+             (stress)  60078  60015 1686020788     27    0.0    3.7    3.7
+             (stress)  60079  60015 1686020788     27    0.0    3.7    3.7
+            (tasktop)  56641  56640 1686020359    456    0.0    1.7    1.7
+           (uniagent)  39728      1 1684980670 1040145    1.3    0.3    1.7
+        (jbd2/vda2-8)    606      2 1684918988 1101827    0.0    1.7    1.7
+               (node)  72010  71892 1686014761   6054    1.3    0.3    1.7
+               (node)  71892  71871 1686014761   6054    1.0    0.0    1.0
+            (dockerd)   1426      1 1684918991 1101824    1.0    0.0    1.0
+         (staragentd)  40538      1 1684980691 1040124    0.3    0.3    0.7
+ (kworker/u8:0-flIsh-  55249      2 1686020629    186    0.0    0.3    0.3
+    (systemd-journalS  38655      1 1684980656 1040159    0.3    0.0    0.3
+            (systemd)      1      0 1684918982 1101833    0.0    0.3    0.3
+         (argusagent)  39850      1 1684980673 1040142    0.3    0.0    0.3
+        (dbus-daemon)   1125      1 1684918989 1101826    0.3    0.0    0.3
+               (java)  41200      1 1684980696 1040119    0.0    0.3    0.3
+               (node)  71957  71892 1686014761   6054    0.3    0.0    0.3
+ (kworker/u8:3-flIsh-  46817      2 1686020071    744    0.0    0.3    0.3
     [D-STASK]
               COMMAND    PID   PPID STACK
           (load_calc)    141    141 [<0>] load_calc_func+0x57/0x130
-                                    [<0>] load_calc_func+0x57/0x130
                                     [<0>] kthread+0xf5/0x130
                                     [<0>] ret_from_fork+0x1f/0x30
                                     [<0>] 0xffffffffffffffff
         (jbd2/vda2-8)    606    606 [<0>] jbd2_journal_commit_transaction+0x1356/0x1b60 [jbd2]
-                                    [<0>] jbd2_journal_commit_transaction+0x1356/0x1b60 [jbd2]
                                     [<0>] kjournald2+0xc5/0x260 [jbd2]
                                     [<0>] kthread+0xf5/0x130
                                     [<0>] ret_from_fork+0x1f/0x30
                                     [<0>] 0xffffffffffffffff
-             (stress)  34110  34110 [<0>] submit_bio_wait+0x84/0xc0
-                                    [<0>] submit_bio_wait+0x84/0xc0
+             (stress)  60022  60022 [<0>] submit_bio_wait+0x84/0xc0
                                     [<0>] blkdev_issue_flush+0x7c/0xb0
                                     [<0>] ext4_sync_fs+0x158/0x1e0 [ext4]
-                                    [<0>] sync_filesystem+0x6e/0x90
-                                    [<0>] ovl_sync_fs+0x36/0x50 [overlay]
                                     [<0>] iterate_supers+0xb3/0x100
                                     [<0>] ksys_sync+0x60/0xb0
                                     [<0>] __ia32_sys_sync+0xa/0x10
                                     [<0>] do_syscall_64+0x55/0x1a0
                                     [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
                                     [<0>] 0xffffffffffffffff
-    ...
-             (stress)  34131  34131 [<0>] submit_bio_wait+0x84/0xc0
-                                    [<0>] submit_bio_wait+0x84/0xc0
+    ...................
+             (stress)  60042  60042 [<0>] submit_bio_wait+0x84/0xc0
                                     [<0>] blkdev_issue_flush+0x7c/0xb0
                                     [<0>] ext4_sync_fs+0x158/0x1e0 [ext4]
-                                    [<0>] sync_filesystem+0x6e/0x90
-                                    [<0>] ovl_sync_fs+0x36/0x50 [overlay]
                                     [<0>] iterate_supers+0xb3/0x100
                                     [<0>] ksys_sync+0x60/0xb0
                                     [<0>] __ia32_sys_sync+0xa/0x10
                                     [<0>] do_syscall_64+0x55/0x1a0
                                     [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
                                     [<0>] 0xffffffffffffffff
-There is a large number of D-state tasks, the system sys time is high, and the kernel stacks of the D-tasks were captured; the cause is mainly the stress processes doing IO.
+    WARN: CPU overall utilization is high.
+    INFO: Sys time of cpu is high.
+    WARN: The most stack, times=17
+    [<0>] submit_bio_wait+0x84/0xc0
+    [<0>] blkdev_issue_flush+0x7c/0xb0
+    [<0>] ext4_sync_fs+0x158/0x1e0 [ext4]
+    [<0>] iterate_supers+0xb3/0x100
+    [<0>] ksys_sync+0x60/0xb0
+    [<0>] __ia32_sys_sync+0xa/0x10
+    [<0>] do_syscall_64+0x55/0x1a0
+    [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+    [<0>] 0xffffffffffffffff
+
+There is a large number of D-state tasks, the system sys time is high, and the kernel stacks of the D-tasks were captured; the cause is mainly the stress processes doing IO. The aggregated stack information shows that a large number of tasks are blocked in submit_bio_wait.
 
 ### 2.6 Simulated lock-contention scenario
 
 There are many kernel code paths that end in the D state; lock contention is one of them.
[<0>] __ia32_sys_sync+0xa/0x10 [<0>] do_syscall_64+0x55/0x1a0 [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<0>] 0xffffffffffffffff - ... - (stress) 34131 34131 [<0>] submit_bio_wait+0x84/0xc0 - [<0>] submit_bio_wait+0x84/0xc0 + ................... + (stress) 60042 60042 [<0>] submit_bio_wait+0x84/0xc0 [<0>] blkdev_issue_flush+0x7c/0xb0 [<0>] ext4_sync_fs+0x158/0x1e0 [ext4] - [<0>] sync_filesystem+0x6e/0x90 - [<0>] ovl_sync_fs+0x36/0x50 [overlay] [<0>] iterate_supers+0xb3/0x100 [<0>] ksys_sync+0x60/0xb0 [<0>] __ia32_sys_sync+0xa/0x10 [<0>] do_syscall_64+0x55/0x1a0 [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<0>] 0xffffffffffffffff -可以看到当前存在大量D状态task,并且系统的sys较高,并且抓取到了D-task的内核栈,主要是由stress进程在进行IO导致。 + WARN: CPU overall utilization is high. + INFO: Sys time of cpu is high. + WARN: The most stack, times=17 + [<0>] submit_bio_wait+0x84/0xc0 + [<0>] blkdev_issue_flush+0x7c/0xb0 + [<0>] ext4_sync_fs+0x158/0x1e0 [ext4] + [<0>] iterate_supers+0xb3/0x100 + [<0>] ksys_sync+0x60/0xb0 + [<0>] __ia32_sys_sync+0xa/0x10 + [<0>] do_syscall_64+0x55/0x1a0 + [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [<0>] 0xffffffffffffffff + +可以看到当前存在大量D状态task,并且系统的sys较高,并且抓取到了D-task的内核栈,主要是由stress进程在进行IO导致。可以看到聚合的stack信息,大量的task阻塞在submit_bio_wait。 ### 2.6 模拟锁竞争场景 @@ -341,92 +402,44 @@ D状态在内核中出现的路径很多,锁的竞争是其中的一种情况 #### 2.6.2 测试结果 - [TIME-STAMP] 2023-06-01 01:58:49 + [TIME-STAMP] 2023-06-06 03:12:29 [UTIL&LOAD] usr sys iowait load1 R D fork : proc - 0.0 20.0 0.0 4.2 0 10 0 : logagentctl.sh(28502) ppid=28501 cnt=5 + 4.0 3.7 0.0 10.1 0 11 120 : bash(71118) ppid=41984 cnt=10 [PER-CPU] - cpu usr sys iowait delay(ns) - cpu-0 0.0 0.0 0.0 110692 - cpu-1 0.0 0.0 0.0 170744 - cpu-2 0.0 66.7 0.0 81461 - cpu-3 0.0 0.0 0.0 148981 - [CGROUP] - cgroup_name nr_periods nr_throttled throttled_time nr_burst burst_time - stress_cg 7811 7801 2868506676781 0 0 + cpu usr sys nice idle iowait h-irq s-irq steal delay(ms) + cpu-0 4.0 3.6 0.0 92.1 0.0 0.0 0.3 0.0 65 + cpu-1 4.3 3.7 0.0 92.0 0.0 0.0 0.0 0.0 58 + cpu-2 3.7 4.7 0.0 91.7 0.0 0.0 0.0 0.0 67 + cpu-3 4.0 2.7 0.0 93.0 0.0 0.0 0.0 0.3 60 [TASKTOP] COMMAND PID PPID START RUN %UTIME %STIME %CPU - (tasktop) 22504 22502 1685584509 220 0.3 0.7 1.0 + (AliYunDun) 59678 1 1686020777 372 2.7 2.0 4.7 + (telegraf) 38654 1 1684980656 1040493 1.0 2.3 3.3 + (uniagent) 39728 1 1684980670 1040479 1.7 1.0 2.7 + (node) 71892 71871 1686014761 6388 1.7 0.3 2.0 + (node) 72010 71892 1686014761 6388 1.3 0.3 1.7 + (tasktop) 56641 56640 1686020359 790 0.3 1.0 1.3 + (node) 71957 71892 1686014761 6388 0.3 0.7 1.0 + (walle) 40189 1 1684980677 1040472 0.7 0.3 1.0 + (staragentd) 40538 1 1684980691 1040458 0.3 0.3 0.7 + (ilogtail) 38538 1620 1684980652 1040497 0.3 0.0 0.3 + (java) 40355 1 1684980680 1040469 0.3 0.0 0.3 + (syslog-ng) 38653 1 1684980656 1040493 0.3 0.0 0.3 + (bash) 41984 71957 1686019937 1212 0.3 0.0 0.3 + (systemd) 1 0 1684918982 1102167 0.3 0.0 0.3 + (systemd-journalS 38655 1 1684980656 1040493 0.0 0.3 0.3 + (sshd) 71789 71756 1686014760 6389 0.0 0.3 0.3 + (logagent) 39295 1 1684980663 1040486 0.3 0.0 0.3 + (AliYunDunUpdateS 1793 1 1684918991 1102158 0.0 0.3 0.3 + (argusagent) 39850 1 1684980673 1040476 0.0 0.3 0.3 [D-STASK] COMMAND PID PPID STACK (load_calc) 141 141 [<0>] load_calc_func+0x57/0x130 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x1f/0x30 [<0>] 0xffffffffffffffff - (cat) 28209 28209 [<0>] open_proc+0x2d/0x7e [mutex_block] - [<0>] proc_reg_open+0x72/0x130 - [<0>] do_dentry_open+0x23a/0x3a0 - [<0>] path_openat+0x768/0x13e0 - [<0>] do_filp_open+0x99/0x110 - [<0>] 
@@ -444,9 +458,19 @@ There are many kernel code paths that end in the D state; lock contention is on
                                     [<0>] do_filp_open+0x99/0x110
                                     [<0>] do_sys_open+0x12e/0x210
                                     [<0>] do_syscall_64+0x55/0x1a0
                                     [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
                                     [<0>] 0xffffffffffffffff
+    WARN: The most stack, times=8
+    [<0>] open_proc+0x2d/0x7e [mutex_block]
+    [<0>] proc_reg_open+0x72/0x130
+    [<0>] do_dentry_open+0x23a/0x3a0
+    [<0>] path_openat+0x768/0x13e0
+    [<0>] do_filp_open+0x99/0x110
+    [<0>] do_sys_open+0x12e/0x210
+    [<0>] do_syscall_64+0x55/0x1a0
+    [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+    [<0>] 0xffffffffffffffff
 
-The command, ppid, pid, and other information of the captured D-state tasks can be seen, together with their current kernel stacks.
+The command, ppid, pid, and other information of the captured D-state tasks can be seen, together with their current kernel stacks. The most frequent blocking call stack is in the open_proc function.
 
 ## 3. Tasktop performance test