diff --git a/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch b/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch deleted file mode 100644 index e6348c7bed408c027b6313ace99cb1054207780a..0000000000000000000000000000000000000000 --- a/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch +++ /dev/null @@ -1,6172 +0,0 @@ -diff --git a/man/man8/alibiolatency.8 b/man/man8/alibiolatency.8 -new file mode 100644 -index 0000000..93f52fa ---- /dev/null -+++ b/man/man8/alibiolatency.8 -@@ -0,0 +1,76 @@ -+.TH alibiolatency 8 "2019-08-25" "USER COMMANDS" -+.SH NAME -+alibiolatency \- Summarize block device I/O latency. -+.SH SYNOPSIS -+.B alibiolatency [\-h] [\-i interval] [-d device] [\-t avg_req_time] [\-T req_time] [-r] -+.SH DESCRIPTION -+ -+Alibiolatency can trace every io request's life cycle and do data filter and statistics -+accordingly. With this tool, we can clearly know io request's different stage's latency -+proportion, for example, if io request D2C(see blktrace) time take majority in total io -+request process time, it's most likely disk bottleneck has been reached. -+ -+This tool uses in-kernel eBPF maps for storing timestamps for efficiency. -+ -+This works by tracing various kernel block tracepoints using dynamic tracing, -+and will need updating to match any changes to these tracepoints. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+\-h -+Print usage message. -+.TP -+\-i interval -+Specify display interval, unit is second. -+.TP -+\-r -+Dump every io request life cycle. -+.TP -+\-d device -+Inspect specified device. -+.TP -+\-t avg_req_time -+Display only when average request process time is greater than this value, unit is ms. -+.TP -+\-T req_time -+Dump request life cycle when single request process time is greater than this value, unit is ms. -+.SH EXAMPLES -+.TP -+Summarize block device I/O latency(default display interval is 2 seconds): -+# -+.B alibiolatency -+.TP -+Print statistics per 5 seconds -+# -+.B alibiolatency \-i 5 -+.TP -+Print only when average request process time is greater than 10ms -+# -+.B alibiolatency \-t 10 -+.TP -+Print request life cycle when single request process time is greater than 20ms. -+# -+.B alibiolatency \-T 20 -+.SH OVERHEAD -+This traces kernel functions and maintains in-kernel timestamps and a histogram, -+which are asynchronously copied to user-space. This method is very efficient, -+and the overhead for most storage I/O rates (< 10k IOPS) should be negligible. -+If you have a higher IOPS storage environment, test and quantify the overhead -+before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Xiaoguang Wang -+.SH SEE ALSO -+biosnoop(8) -diff --git a/man/man8/aliexitsnoop.8 b/man/man8/aliexitsnoop.8 -new file mode 100644 -index 0000000..cd7215e ---- /dev/null -+++ b/man/man8/aliexitsnoop.8 -@@ -0,0 +1,109 @@ -+.TH aliexitsnoop 8 "2019-05-28" "USER COMMANDS" -+.SH NAME -+aliexitsnoop \- Trace all process termination (exit, fatal signal). Uses Linux eBPF/bcc. -+.SH SYNOPSIS -+.B aliexitsnoop [\-h] [\-t] [\-\-utc] [\-x] [\-p PID] [\-\-label LABEL] -+.SH DESCRIPTION -+aliexitsnoop traces process termination, showing the command name and reason for -+termination, either an exit or a fatal signal. This Linux tool is actually an enhanced -+version of the exitsnoop. For more details about exitsnoop, see exitsnoop_example.txt. -+Compared to the exitsnoop, aliexitsnoop not only print the exit code, pid, ppid, lifetime -+of the exiting processes, but also the stack of the exiting processes. The demand stems -+from the fact that in the production environment, some processes end up unexpectedly. -+Therefore we need to dump the stack of the target processes to find out what caused the -+target process to exit. -+ -+It catches processes of all users, processes in containers, as well -+as processes that become zombie. -+ -+This works by tracing the kernel do_exit() function, -+and will need updating to match any changes to this function. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-t -+Include a timestamp column. -+.TP -+\-\-utc -+Include a timestamp column, use UTC timezone. -+.TP -+\-x -+Exclude successful exits, exit( 0 ) -+.TP -+\-p PID -+Trace this process ID only (filtered in-kernel). -+.TP -+\-\-label LABEL -+Label each line with LABEL (default 'exit') in first column (2nd if timestamp is present). -+.SH EXAMPLES -+.TP -+Trace all process termination -+# -+.B aliexitsnoop -+.TP -+Trace all process termination, and include timestamps: -+# -+.B aliexitsnoop \-t -+.TP -+Exclude successful exits, only include non-zero exit codes and fatal signals: -+# -+.B aliexitsnoop \-x -+.TP -+Trace PID 181 only: -+# -+.B aliexitsnoop \-p 181 -+.TP -+Label each output line with 'EXIT': -+# -+.B aliexitsnoop \-\-label EXIT -+.SH FIELDS -+.TP -+TIME-TZ -+Time of process termination HH:MM:SS.sss with milliseconds, where TZ is -+the local time zone, 'UTC' with \-\-utc option. -+.TP -+LABEL -+The optional label if \-\-label option is used. This is useful with the -+\-t option for timestamps when the output of several tracing tools is -+sorted into one combined output. -+.TP -+PCOMM -+Process/command name. -+.TP -+PID -+Process ID -+.TP -+PPID -+The process ID of the process that will be notified of PID termination. -+.TP -+TID -+Thread ID. -+.TP -+EXIT_CODE -+The exit code for exit() or the signal number for a fatal signal. -+.SH OVERHEAD -+This traces the kernel do_exit() function and prints output for each event. -+As the rate of this is generally expected to be low (< 1000/s), the overhead is also -+expected to be negligible. If you have an application that has a high rate of -+process termination, then test and understand overhead before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Alvin Zheng -+.SH SEE ALSO -+aliexecsnoop(8) -diff --git a/man/man8/aliext4writeslower.8 b/man/man8/aliext4writeslower.8 -new file mode 100644 -index 0000000..007a474 ---- /dev/null -+++ b/man/man8/aliext4writeslower.8 -@@ -0,0 +1,70 @@ -+.TH aliext4writeslower 8 "2019-08-30" "USER COMMANDS" -+.SH NAME -+aliext4writeslower \- Trace common ext4 write(2) slower than a threshold -+.SH SYNOPSIS -+.B aliext4writeslower [-h] [-p PID] [-d device] [min_ms] -+.SH DESCRIPTION -+ -+This script traces key steps of write(2) which maybe time-consuming, -+here trace 4 key steps: jbd2 handle start(it may need jbd2 transaction -+checkpoint or commit), page cache allocation, block allocation and -+balance dirty pages, and we also take into account sched delay. -+ -+With this tool, we can clearly know write(2)'s latency proportion. -+ -+This tool uses in-kernel eBPF maps for storing timestamps for efficiency. -+ -+This works by tracing various kernel functions using dynamic tracing, -+and will need updating to match any changes to these tracepoints. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+\-h -+Print usage message. -+.TP -+\-i pid -+trace this PID only -+.TP -+min_ms -+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms. -+.SH EXAMPLES -+.TP -+Trace write(2) slower than 10 ms (default). -+# -+.B aliext4writeslower -+.TP -+Trace write(2) slower than 5 ms. -+# -+.B aliext4writeslower 5 -+.TP -+Trace PID 185 only. -+# -+.B aliext4writeslower -p 185 -+.TP -+Trace /dev/sda5 which contains ext4 fs. -+# -+.B aliext4writeslower -d sda5 -+.TP -+.SH OVERHEAD -+This traces kernel functions and maintains in-kernel timestamps and a histogram, -+which are asynchronously copied to user-space. This method is very efficient, -+and the overhead for most storage I/O rates (< 10k IOPS) should be negligible. -+If you have a higher IOPS storage environment, test and quantify the overhead -+before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Xiaoguang Wang -+.SH SEE ALSO -+biosnoop(8) -diff --git a/man/man8/alihardirqs.8 b/man/man8/alihardirqs.8 -new file mode 100644 -index 0000000..d874e89 ---- /dev/null -+++ b/man/man8/alihardirqs.8 -@@ -0,0 +1,99 @@ -+.TH alihardirqs 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alihardirqs \- Summarize hard IRQ (interrupt) event time. -+.SH SYNOPSIS -+.B alihardirqs [\-h] [\-T] [\-N] [\-D] [\-c CPU] [\-i IRQ] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes hard irq statistics. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-D -+Show distributions as histograms. -+.TP -+\-c -+Show hardIRQ statistics on specific CPU only. -+.TP -+\-i -+Show statistics of specific hardIRQ only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize hardIRQ statistics: -+# -+.B alihardirqs -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alihardirqs 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alihardirqs \-NT 1 -+.TP -+Show hardIRQ statistics as histograms: -+# -+.B alihardirqs \-D -+.TP -+Show hardIRQ statistics of CPU 0 only: -+# -+.B alihardirqs \-c 0 -+.TP -+Show statistics of hardIRQ 25 only: -+# -+.B alihardirqs \-i 25 -+.SH FIELDS -+.TP -+CPU -+CPU index. -+.TP -+HARDIRQ -+hardIRQ number. -+.TP -+IRQHandler -+Name of hardIRQ handler. -+.TP -+Count -+Number of samples on specific CPU or hardIRQ. -+.TP -+Total_ -+Number of time spent on specific hardIRQ. -+.TP -+MAX_ -+Maximum of time spent on specific hardIRQ. -+.TP -+distribution -+An ASCII bar chart to visualize the distribution (Total_ column) -+.SH OVERHEAD -+This sampls at every irq_handler_entry/irq_handler_exit tracepoint, and in-kernel -+summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alijbd2stat.8 b/man/man8/alijbd2stat.8 -new file mode 100644 -index 0000000..4f2de07 ---- /dev/null -+++ b/man/man8/alijbd2stat.8 -@@ -0,0 +1,61 @@ -+.TH alijbd2stat 8 "2019-08-25" "USER COMMANDS" -+.SH NAME -+alijbd2stat \- Summarize filesystem jbd2 stat. -+.SH SYNOPSIS -+.B alijbd2stat [\-h] [\-d device] [\-t thresh] -+.SH DESCRIPTION -+ -+Alijbd2stat can trace every jbd2 handle and every transaction commit info. -+ -+If one jbd2 handle's running time is greater than user specified threshold time, Alijbd2stat -+will catch this handle's context. -+ -+This tool uses in-kernel eBPF maps for storing timestamps and the histogram, -+for efficiency. -+ -+This works by tracing various kernel block tracepoints using dynamic tracing, -+and will need updating to match any changes to these tracepoints. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+\-h -+Print usage message. -+.TP -+\-d device -+Inspect specified device. -+.TP -+\-t thresh -+Show jbd2 handle's context when its run time is greater than this threshold. -+.SH EXAMPLES -+.TP -+Print all jbd2 instances stat. -+# -+.B alijbd2stat -+.TP -+Inspect specified device /dev/sda3, on which some filesystem uses jbd2. -+# -+.B alijbd2stat \-d sda3 -+.TP -+Show jbd2 handle's context when it runs more than 10ms -+# -+.B alijbd2stat \-t 10 -+.TP -+.SH OVERHEAD -+Test and quantify the overhead before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Xiaoguang Wang -+.SH SEE ALSO -+biosnoop(8) -diff --git a/man/man8/alimutexsnoop.8 b/man/man8/alimutexsnoop.8 -new file mode 100644 -index 0000000..f0342b4 ---- /dev/null -+++ b/man/man8/alimutexsnoop.8 -@@ -0,0 +1,71 @@ -+.TH alimutexsnoop 8 "2019-05-28" "USER COMMANDS" -+.SH NAME -+alimutexsnoop \- Trace the mutex that is held by the process longer than a specified time threshold. Uses Linux eBPF/bcc. -+.SH SYNOPSIS -+.B alimutexsnoop [\-h] [\-t THRESHOLD] [\-p PID] -+.SH DESCRIPTION -+alimutexsnoop monitors the mutex that is held by the process longer than -+a specified time threshold, such as 100us. Once alimutexsnoop finds such mutexes, -+it prints the pid of the process that held the mutex and the address of the -+mutex. In addition, alimutexsnoop also prints the user and kernel stack of the -+process. -+ -+This works by tracing the kernel mutex_lock() and mutex_unlock() function, -+and will need updating to match any changes to this function. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-t -+Set the time threshold. The mutexes held longer than this time threshold will be traced(us). -+.TP -+\-p PID -+Trace this process ID only (filtered in-kernel). -+.SH EXAMPLES -+.TP -+Trace all mutexes that are held longer than 1000us (default): -+# -+.B alimutexsnoop -+.TP -+Trace all mutexes that are held longer than 50us: -+# -+.B alimutexsnoop \-t 50 -+.TP -+Trace PID 181 only: -+# -+.B alimutexsnoop \-p 181 -+.SH FIELDS -+.TP -+PCOMM -+Process/command name. -+.TP -+PID -+Process ID -+.TP -+MUTEX_ADDR -+The address of the mutex object. -+.SH OVERHEAD -+This traces the kernel mutex_lock() and mutex_unlock() function and compute the time -+length of the process holds the mutex. The overhead is positively related to the frequency -+of acquiring and releasing mutex. If you have an application that gets and releases mutexes -+frequently, then test and understand overhead before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Alvin Zheng -+.SH SEE ALSO -+alimutexsnoop(8) -diff --git a/man/man8/alirunqinfo.8 b/man/man8/alirunqinfo.8 -new file mode 100644 -index 0000000..1ffa3cb ---- /dev/null -+++ b/man/man8/alirunqinfo.8 -@@ -0,0 +1,109 @@ -+.TH alirunqinfo 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alirunqinfo \- Summarize runqueue statistics. -+.SH SYNOPSIS -+.B alirunqinfo [\-h] [\-T] [\-N] [\-D] [\-C] [\-P] [\-c CPU] [\-p PID] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes task on-CPU time before being descheduled. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-D -+Show distributions as histograms. -+.TP -+\-C -+Show per-CPU runqueue statistics. -+.TP -+\-P -+Show per-thread runqueue statistics. -+.TP -+\-c -+Show runqueue statistics on specific CPU only. -+.TP -+\-p -+Show runqueue statistics of specific PID only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize runqueue statistics: -+# -+.B alirunqinfo -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alirunqinfo 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alirunqinfo \-NT 1 -+.TP -+Show per-CPU runqueue statistics: -+# -+.B alirunqinfo \-C -+.TP -+Show per-thread runqueue statistics: -+# -+.B alirunqinfo \-P -+.TP -+Show runqueue statistics as histograms: -+# -+.B alirunqinfo \-D -+.TP -+Show runqueue statistics of CPU 0 only: -+# -+.B alirunqinfo \-c 0 -+.TP -+Show runqueue statistics of PID 25 only: -+# -+.B alirunqinfo \-p 25 -+.SH FIELDS -+.TP -+CPU -+CPU index. -+.TP -+PID -+PID of sampled process. -+.TP -+Thread -+Name of the specified process. -+.TP -+Count -+Number of samples on specific CPU or PID. -+.TP -+Total_ -+Number of task on-CPU time accumulated on specific CPU or PID. -+.TP -+distribution -+An ASCII bar chart to visualize the distribution (Total_ column) -+.SH OVERHEAD -+This sampls at every sched_switch tracepoint, and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alisoftirqs.8 b/man/man8/alisoftirqs.8 -new file mode 100644 -index 0000000..1fc9c50 ---- /dev/null -+++ b/man/man8/alisoftirqs.8 -@@ -0,0 +1,99 @@ -+.TH alisoftirqs 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alisoftirqs \- Summarize soft IRQ (interrupt) event time. -+.SH SYNOPSIS -+.B alisoftirqs [\-h] [\-T] [\-N] [\-D] [\-c CPU] [\-i IRQ] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes soft irq statistics. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-D -+Show distributions as histograms. -+.TP -+\-c -+Show softIRQ statistics on specific CPU only. -+.TP -+\-i -+Show statistics of specific softIRQ only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize softIRQ statistics: -+# -+.B alisoftirqs -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alisoftirqs 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alisoftirqs \-NT 1 -+.TP -+Show softIRQ statistics as histograms: -+# -+.B alisoftirqs \-D -+.TP -+Show softIRQ statistics of CPU 0 only: -+# -+.B alisoftirqs \-c 0 -+.TP -+Show statistics of softIRQ 25 only: -+# -+.B alisoftirqs \-i 25 -+.SH FIELDS -+.TP -+CPU -+CPU index. -+.TP -+SOFTIRQ -+softIRQ number. -+.TP -+IRQHandler -+Name of softIRQ handler. -+.TP -+Count -+Number of samples on specific CPU or softIRQ. -+.TP -+Total_ -+Number of time spent on specific softIRQ. -+.TP -+MAX_ -+Maximum of time spent on specific softIRQ. -+.TP -+distribution -+An ASCII bar chart to visualize the distribution (Total_ column) -+.SH OVERHEAD -+This sampls at every softirq_entry/softirq_exit tracepoint, and in-kernel -+summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alisyscount.8 b/man/man8/alisyscount.8 -new file mode 100644 -index 0000000..284084a ---- /dev/null -+++ b/man/man8/alisyscount.8 -@@ -0,0 +1,112 @@ -+.TH alisyscount 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alisyscount \- Summarize syscall counts and latencies. -+.SH SYNOPSIS -+.B alisyscount [\-h] [\-T] [\-N] [\-D] [\-S] [\-P] [\-s syscall] [\-p PID] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes syscall counts and latencies. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-D -+Show distributions as histograms. -+.TP -+\-S -+Show per-syscall statistics. -+.TP -+\-P -+Show per-thread statistics. -+.TP -+\-s -+Show statistics of specific syscall only. -+.TP -+\-p -+Show syscall statistics of specific PID only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize syscall statistics: -+# -+.B alisyscount -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alisyscount 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alisyscount \-NT 1 -+.TP -+Show syscall statistics as histograms: -+# -+.B alisyscount \-D -+.TP -+Show per-syscall statistics: -+# -+.B alisyscount \-S -+.TP -+Show per-thread syscall statistics: -+# -+.B alisyscount \-P -+.TP -+Show statistics of syscall 25 only: -+# -+.B alisyscount \-s 25 -+.TP -+Show syscall statistics of PID 1213 only: -+# -+.B alisyscount \-p 1213 -+.SH FIELDS -+.TP -+SysNR -+syscall number. -+.TP -+Syscall -+Name of syscall. -+.TP -+PID -+PID of specific process. -+.TP -+Thread -+Name of specific process. -+.TP -+Count -+Number of samples on specific PID or syscall. -+.TP -+Total_ -+Number of time spent on specific syscall. -+.TP -+distribution -+An ASCII bar chart to visualize the distribution (Total_ column) -+.SH OVERHEAD -+This sampls at every sys_enter/sys_exit tracepoint, and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alisysdelay.8 b/man/man8/alisysdelay.8 -new file mode 100644 -index 0000000..a7d91aa ---- /dev/null -+++ b/man/man8/alisysdelay.8 -@@ -0,0 +1,71 @@ -+.TH alisysdelay 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alisysdelay \- Catches tasks hung in kernel mode and get its stack. -+.SH SYNOPSIS -+.B alisysdelay [\-h] [\-d] [\-a] [\-f frequency] [\-t threshold] [\-\-stack\-storage\-size size] -+.SH DESCRIPTION -+This program catches tasks hung in kernel mode and get its stack. -+ -+In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes -+may be unacceptable, because some tasks may run too long in kernel mode -+without calling cond_sched(). These tasks may be kernel thread, or user -+processes that enter kernel mode by syscall. -+ -+alisysdelay can catch tasks that have been running continuously for a long time -+whithout scheduling, and kernel/user stackes of these tasks will be printed. -+The threshold can be user defined. -+ -+By default CPU idle stacks are excluded by simply excluding PID 0. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-d -+Insert delimiter between kernel/user stacks. -+.TP -+\-a -+Add _[k] annotations to kernel frames. -+.TP -+\-f -+Sample frequency in Hertz. -+.TP -+\-t -+Set the threshold of delay in kernel mode, in us. -+.TP -+\-\-stack\-storage\-size -+The number of unique stack traces that can be stored and displayed. -+.SH EXAMPLES -+.TP -+Catches tasks hung in kernel mode and get its stack: -+# -+.B alisysdelay -+.TP -+Catches tasks hung in kernel mode at 99 Hz: -+# -+.B alisysdelay \-f 99 -+.TP -+Catches tasks that hung in kernel mode for more than 100us: -+# -+.B alisysdelay \-t 100 -+.SH FIELDS -+.SH OVERHEAD -+This sampls at every sys_enter/sched_switch tracepoint and _cond_resched kprobe, -+and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alisyslatency.8 b/man/man8/alisyslatency.8 -new file mode 100644 -index 0000000..854b7a0 ---- /dev/null -+++ b/man/man8/alisyslatency.8 -@@ -0,0 +1,85 @@ -+.TH alisyslatency 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alisyslatency \- Summarize time spent in kernel context.. -+.SH SYNOPSIS -+.B alisyslatency [\-h] [\-T] [\-N] [\-c CPU] [\-p PID] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes time spent in kernel context.. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-c -+Show statistics on specific CPU only. -+.TP -+\-p -+Show statistics of specific PID only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize time in kernel context per CPU: -+# -+.B alisyslatency -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alisyslatency 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alisyslatency \-NT 1 -+.TP -+Show time in kernel context in CPU 0 only: -+# -+.B alisyslatency \-c 0 -+.TP -+Show time in kernel context in pid 25 only: -+# -+.B alisyslatency \-p 25 -+.SH FIELDS -+.TP -+CPU -+CPU index. -+.TP -+PID -+PID of sampled process. -+.TP -+Count -+Number of samples on specific CPU or PID. -+.TP -+Total_ -+Number of time spent on specific PID. -+.TP -+MAX_ -+Maximum of time spent on specific PID. -+.SH OVERHEAD -+This sampls at every user_exit/user_enter tracepoint, and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/aliworkslower.8 b/man/man8/aliworkslower.8 -new file mode 100644 -index 0000000..4dac8fb ---- /dev/null -+++ b/man/man8/aliworkslower.8 -@@ -0,0 +1,59 @@ -+.TH aliworkslower 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+aliworkslower \- Trace high work sched latency. -+.SH SYNOPSIS -+.B aliworkslower [\-h] [threshold] -+.SH DESCRIPTION -+This program traces high work sched latency. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+[threshold] -+Set threshold of Q2S latency, in microsecond. -+.SH EXAMPLES -+.TP -+Trace work sched latency higher than 10000 us (default): -+# -+.B aliworkslower -+.TP -+Trace work sched latency higher than 1000 us: -+# -+.B aliworkslower 1000 -+.SH FIELDS -+.TP -+Worker Thread -+PID of worker thread. -+.TP -+Work -+Work callback. -+.TP -+Total Q2S latency -+Latency between workqueue_queue_work (Q) and workqueue_execute_start (S), in us. -+.TP -+Q2A latency -+Latency between workqueue_queue_work (Q) and workqueue_execute_start (S), in us. -+.TP -+A2S latency -+Latency between workqueue_activate_work (A) and workqueue_execute_start (S), in us. -+.SH OVERHEAD -+This sampls at every workqueue_queue_work/workqueue_activate_work/ -+workqueue_execute_start tracepoint, and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py -index 9222e37..db51793 100755 ---- a/tests/python/test_tools_smoke.py -+++ b/tests/python/test_tools_smoke.py -@@ -421,5 +421,53 @@ class SmokeTests(TestCase): - # Fails to attach the probe if zfs is not installed. - pass - -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alirunqinfo(self): -+ self.run_with_duration("alirunqinfo.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alihardirqs(self): -+ self.run_with_duration("alihardirqs.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alisoftirqs(self): -+ self.run_with_duration("alisoftirqs.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alisyscount(self): -+ self.run_with_duration("alisyscount.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alisysdelay(self): -+ self.run_with_int("alisysdelay.py", timeout=10, kill_timeout=15) -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alisyslatency(self): -+ self.run_with_duration("alisyslatency.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_aliworkslower(self): -+ self.run_with_int("aliworkslower.py", timeout=10, kill_timeout=15) -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_aliexitsnoop(self): -+ self.run_with_int("aliexitsnoop.py ") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alimutexsnoop(self): -+ self.run_with_int("alimutexsnoop.py ") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alibiolatency(self): -+ self.run_with_int("alibiolatency.py ") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alijbd2stat(self): -+ self.run_with_int("alijbd2stat.py ") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alijbd2stat(self): -+ self.run_with_int("aliext4writeslower.py ") -+ - if __name__ == "__main__": - main() -diff --git a/tools/alibiolatency.py b/tools/alibiolatency.py -new file mode 100755 -index 0000000..07f98d3 ---- /dev/null -+++ b/tools/alibiolatency.py -@@ -0,0 +1,532 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alibiolatency Summarize block device I/O latency. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alibiolatency [-h] [-i interval] [-d device] [-t avg_req_time] [-T req_time] [-r] -+# -+# Copyright (c) 2019-2021 Alibaba Group. -+# Licensed under the Apache License, Version 2.0 (the "License") -+# -+# 2019/07/01 Xiaoguang Wang Created this. -+ -+from __future__ import print_function -+from bcc import BPF -+import ctypes as ct -+import argparse -+import time -+ -+# arguments -+examples = """examples: -+ ./alibiolatency # summarize block I/O latency(default display interval is 2s) -+ ./alibiolatency -d sda3 # inspect specified device /dev/sda3 -+ ./alibiolatency -i 2 # specify display interval, 2s -+ ./alibiolatency -t 10 # display only when average request process time is greater than 10ms -+ ./alibiolatency -T 20 # dump request life cycle when single request process time is greater than 20ms -+ ./alibiolatency -r # dump every io request life cycle -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize block device I/O latency", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-d", "--device", help="inspect specified device") -+parser.add_argument("-i", "--dis_interval", nargs="?", default=2, -+ help="specify display interval") -+parser.add_argument("-t", "--avg_threshold_time", nargs="?", default=0, -+ help="display only when average request process time is greater than this value") -+parser.add_argument("-T", "--threshold_time", nargs="?", default=999999999, -+ help="dump request life cycle when single request process time is greater than this value") -+parser.add_argument("-r", "--dump_raw", action="store_true", help="dump every io request life cycle") -+parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) -+args = parser.parse_args() -+threshold_time = int(args.threshold_time) -+avg_threshold_time = int(args.avg_threshold_time) -+dis_interval = int(args.dis_interval) -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+#include -+ -+struct req_stat { -+ dev_t dev; -+ u64 queue_us; -+ u64 getrq_us; -+ u64 insert_us; -+ u64 issue_us; -+ u64 completion_us; -+ u64 sector; -+ u32 num_sectors; -+ char rwbs[8]; -+}; -+ -+typedef struct disk_key { -+ dev_t dev; -+ u64 sector; -+} disk_key_t; -+ -+BPF_HASH(io_requests, disk_key_t, struct req_stat); -+BPF_PERF_OUTPUT(req_stat); -+ -+ -+TRACEPOINT_PROBE(block, block_bio_queue) -+{ -+ disk_key_t key = {}; -+ struct req_stat s = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ s.dev = key.dev; -+ s.queue_us = bpf_ktime_get_ns() / 1000; -+ bpf_probe_read(s.rwbs, 8, args->rwbs); -+ io_requests.update(&key, &s); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_getrq) -+{ -+ struct req_stat *r; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ r->getrq_us = bpf_ktime_get_ns() / 1000; -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_bio_backmerge) -+{ -+ struct req_stat *r; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ io_requests.delete(&key); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_bio_frontmerge) -+{ -+ struct req_stat *r; -+ struct req_stat *r2; -+ disk_key_t key = {}; -+ disk_key_t key2 = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector + args->nr_sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ key2.dev = args->dev; -+ key2.sector = args->sector; -+ r2 = io_requests.lookup(&key2); -+ if (r2 == NULL) { -+ io_requests.delete(&key); -+ return 0; -+ } -+ -+ // keep old value. -+ r2->queue_us = r->queue_us; -+ r2->getrq_us = r->getrq_us; -+ r2->insert_us = r->insert_us; -+ r2->issue_us = r->issue_us; -+ io_requests.delete(&key); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_rq_insert) -+{ -+ struct req_stat *r; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ r->insert_us = bpf_ktime_get_ns() / 1000; -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_rq_issue) -+{ -+ struct req_stat *r; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ r->issue_us = bpf_ktime_get_ns() / 1000; -+ r->num_sectors = args->nr_sector; -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_rq_complete) -+{ -+ struct req_stat *r; -+ struct req_stat val = {}; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ /* could be happen. -+ if (r->num_sectors != args->nr_sector) -+ bpf_trace_printk("here"); -+ */ -+ r->completion_us = bpf_ktime_get_ns() / 1000; -+ /* FIXME: should we use r? */ -+ //val = *r; -+ val.dev = r->dev; -+ val.queue_us = r->queue_us; -+ val.getrq_us = r->getrq_us; -+ val.insert_us = r->insert_us; -+ val.issue_us = r->issue_us; -+ val.completion_us = r->completion_us; -+ val.sector = args->sector; -+ val.num_sectors = args->nr_sector; -+ memcpy(val.rwbs, r->rwbs, 8); -+ -+ io_requests.delete(&key); -+ -+ req_stat.perf_submit(args, &val, sizeof(struct req_stat)); -+ return 0; -+} -+ -+int trace_elv_merge_requests(struct pt_regs *ctx, struct request_queue *q, -+ struct request *rq, struct request *next) -+{ -+ disk_key_t key = {}; -+ dev_t dev = (next->rq_disk)->part0.__dev.devt; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = dev; -+ key.sector = next->__sector; -+ io_requests.delete(&key); -+ -+ return 0; -+} -+ -+""" -+ -+dev_name={} -+ -+def init_dev_name(): -+ global dev_name -+ -+ f = open("/proc/partitions") -+ for line in f.readlines(): -+ line = line.strip() -+ if not len(line): -+ continue; -+ -+ str = line.split() -+ if str[0].isalpha(): -+ continue -+ dev_num = (int(str[0]) << 20) + int(str[1]) -+ dev_name[dev_num] = str[3] -+ f.close() -+ -+init_dev_name() -+name2devid={v:k for k,v in dev_name.items()} -+ -+if args.device: -+ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) -+else: -+ bpf_text = bpf_text.replace('FILTER_DEV', '0') -+ -+if args.dump_raw: -+ dump_raw = 1 -+else: -+ dump_raw = 0 -+ -+# code substitutions -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+# load BPF program -+b = BPF(text=bpf_text) -+b.attach_kprobe(event="elv_merge_requests", fn_name="trace_elv_merge_requests") -+ -+print("Tracing block device I/O... Hit Ctrl-C to end.") -+ -+class Data(ct.Structure): -+ _fields_ = [ -+ ("dev", ct.c_uint), -+ ("queue_us", ct.c_ulonglong), -+ ("getrq_us", ct.c_ulonglong), -+ ("insert_us", ct.c_ulonglong), -+ ("issue_us", ct.c_ulonglong), -+ ("completion_us", ct.c_ulonglong), -+ ("sector", ct.c_ulonglong), -+ ("num_sectors", ct.c_uint), -+ ("rwbs", ct.c_char * 8) -+ ] -+ -+io_stats={} -+last_dis_time=0 -+ -+class req_stats: -+ def __init__(self, dev): -+ self.dev=dev -+ self.sectors=[0, 0, 0] -+ self.num_reqs=[0, 0, 0] -+ self.lat=[0, 0, 0] -+ self.lat_max=[0, 0, 0] -+ self.lat_min=[99999999, 99999999, 99999999] -+ self.i2d_lat=[0, 0, 0] -+ self.i2d_lat_max=[0, 0, 0] -+ self.i2d_lat_min=[99999999, 99999999, 99999999] -+ self.d2c_lat=[0, 0, 0] -+ self.d2c_lat_max=[0,0,0] -+ self.d2c_lat_min=[99999999, 99999999, 99999999] -+ -+ def update_req_stats(self, io_type, num_sectors, lat, i2d_lat, d2c_lat): -+ self.num_reqs[io_type] += 1 -+ self.sectors[io_type] += num_sectors -+ self.lat[io_type] += lat -+ self.i2d_lat[io_type] += i2d_lat -+ self.d2c_lat[io_type] += d2c_lat -+ -+ if lat > self.lat_max[io_type]: -+ self.lat_max[io_type] = lat; -+ -+ if lat < self.lat_min[io_type]: -+ self.lat_min[io_type] = lat; -+ -+ if i2d_lat > self.i2d_lat_max[io_type]: -+ self.i2d_lat_max[io_type] = i2d_lat -+ -+ if i2d_lat < self.i2d_lat_min[io_type]: -+ self.i2d_lat_min[io_type] = i2d_lat -+ -+ if d2c_lat > self.d2c_lat_max[io_type]: -+ self.d2c_lat_max[io_type] = d2c_lat -+ -+ if d2c_lat < self.d2c_lat_min[io_type]: -+ self.d2c_lat_min[io_type] = d2c_lat -+ -+ def display_req_stats(self, time_interval): -+ global avg_threshold_time -+ -+ ti = time_interval / 1000.0 / 1000.0 -+ -+ print("dev: %s" % dev_name[self.dev]) -+ for io_type in (0, 1, 2): -+ if self.num_reqs[io_type] == 0: -+ continue -+ total_reqs = self.num_reqs[io_type] -+ -+ iops = int(total_reqs / ti) -+ avg_lat = self.lat[io_type] / 1000.0 / total_reqs -+ if avg_lat < avg_threshold_time: -+ continue; -+ -+ speed = self.sectors[io_type] * 512 / 1024 / 1024 / ti -+ avg_req_sz = self.sectors[io_type] * 512 / 1024 / total_reqs -+ -+ i2d_avg = self.i2d_lat[io_type] / 1000.0 / total_reqs -+ i2d_percent = int(i2d_avg / avg_lat * 100) -+ -+ d2c_avg = self.d2c_lat[io_type] / 1000.0 / total_reqs -+ d2c_percent = int(d2c_avg / avg_lat * 100) -+ -+ if io_type == 0: -+ io_type_str="READ" -+ elif io_type == 1: -+ io_type_str="WRITE" -+ else: -+ io_type_str="DISCARD" -+ -+ print(" %s iops:%d avg:%.2f min:%.2f max:%.2f speed:%.2fMB/s avgrq-sz:%dKB" % -+ (io_type_str, iops, avg_lat, self.lat_min[io_type] / 1000.0, -+ self.lat_max[io_type] / 1000.0, speed, avg_req_sz)) -+ -+ print(" [i2d_avg: %.2f percent:%d min:%.2f max:%.2f" % (i2d_avg, -+ i2d_percent, self.i2d_lat_min[io_type] / 1000.0, -+ self.i2d_lat_max[io_type] / 1000.0)) -+ -+ print(" [d2c_avg: %.2f percent:%d min:%.2f max:%.2f" % (d2c_avg, -+ d2c_percent, self.d2c_lat_min[io_type] / 1000.0, -+ self.d2c_lat_max[io_type] / 1000.0)) -+ -+ -+def get_io_type(rwbs): -+ # please see logic in kernel func blk_fill_rwbs() -+ if rwbs[0] == 'R' or rwbs[0:2] == "FR": -+ return 0; -+ elif rwbs[0] == 'W' or rwbs[0:2] == "FW": -+ return 1; -+ return 2 -+ -+def is_exception_req(event): -+ # we count this as a abnomal request... -+ # Currently I still hasn't figured out in which case getrq_us -+ # will be zero... -+ if event.getrq_us == 0: -+ print("dev:%s act:%s sector:%lu bytes:%u exception req Q:%lu G:%lu I:%lu D:%lu C:%lu" % ( -+ dev_name[event.dev], event.rwbs, event.sector, event.num_sectors, event.queue_us, -+ event.getrq_us, event.insert_us, event.issue_us, event.completion_us)) -+ -+def dump_raw_trace_event(dev, act, ts, event): -+ print("%s %s %lu sector:%lu bytes:%u act:%s" % (dev, act, ts / 1000, -+ event.sector, event.num_sectors, event.rwbs)) -+ -+def dump_timeout_req(event): -+ dev = dev_name[event.dev] -+ localtime = time.asctime( time.localtime(time.time())) -+ -+ print("%s" % localtime) -+ print("timeouted request sector:%lu bytes:%u req time:%lu" % (event.sector, -+ event.num_sectors, (event.completion_us - event.queue_us) / 1000)) -+ dump_raw_trace_event(dev, "Q", event.queue_us, event) -+ dump_raw_trace_event(dev, "G", event.getrq_us, event) -+ dump_raw_trace_event(dev, "I", event.insert_us, event) -+ dump_raw_trace_event(dev, "D", event.issue_us, event) -+ dump_raw_trace_event(dev, "C", event.completion_us, event) -+ -+def dump_req_life_cycle(event): -+ dev = dev_name[event.dev] -+ localtime = time.asctime( time.localtime(time.time())) -+ -+ print("%s" % localtime) -+ print("raw request event sector:%lu bytes:%u req time:%lu" % (event.sector, -+ event.num_sectors, (event.completion_us - event.queue_us) / 1000)) -+ dump_raw_trace_event(dev, "Q", event.queue_us, event) -+ dump_raw_trace_event(dev, "G", event.getrq_us, event) -+ dump_raw_trace_event(dev, "I", event.insert_us, event) -+ dump_raw_trace_event(dev, "D", event.issue_us, event) -+ dump_raw_trace_event(dev, "C", event.completion_us, event) -+ -+# process event -+def print_event(cpu, data, size): -+ global io_stats -+ global last_dis_time -+ -+ event = ct.cast(data, ct.POINTER(Data)).contents -+ -+ is_exception_req(event) -+ -+ if dump_raw != 0: -+ dump_req_life_cycle(event) -+ -+ lat = event.completion_us - event.queue_us; -+ -+ if event.issue_us != 0: -+ d2c_lat = event.completion_us - event.issue_us -+ elif event.insert_us != 0: -+ d2c_lat = event.completion_us - event.insert_us -+ elif event.getrq_us != 0: -+ d2c_lat = event.completion_us - event.getrq_us -+ else: -+ d2c_lat = lat -+ -+ # Note: this could happen... -+ # In kernel 4.9, for example, scsi_request_fn calls blk_peek_request() -+ # to fetch a request, but if later scsi driver could not process this -+ # request(see not_ready lable in scsi_request_fn), it will call -+ # blk_requeue_request() to put back this request to block elevator, now -+ # insert_us will be greater than issue_us. -+ # -+ # Later scsi_request_fn will pick this request again, but still it could -+ # not handle this request, see codes in scsi_request_fn -+ # if (!scsi_dev_queue_ready(q, sdev)) -+ # break; -+ # then this request will still be in block elevator, but with RQF_STARTED -+ # marked. -+ # -+ # Finally scsi_request_fn calls blk_peek_request() to process this request, -+ # because RQF_STARTED is set, and this time blk_peek_request won't call -+ # trace_block_rq_issue(), then issue_us also won't be updated. -+ # Now insert_us is greater than issue_us. -+ if event.issue_us < event.insert_us: -+ print("lege %u %s sector:%lu bytes:%u %lu %lu" % (dev_name[event.dev], -+ event.rwbs, event.sector, event.num_sectors, event.insert_us, -+ event.issue_us)) -+ -+ if event.insert_us != 0 and event.issue_us > event.insert_us: -+ i2d_lat = event.issue_us - event.insert_us; -+ else: -+ i2d_lat = 0 -+ io_type = get_io_type(event.rwbs) -+ -+ if last_dis_time == 0: -+ last_dis_time = event.completion_us -+ -+ if not io_stats.has_key(event.dev): -+ io_stats[event.dev] = req_stats(event.dev) -+ -+ if lat > threshold_time * 1000: -+ dump_timeout_req(event) -+ -+ io_stats[event.dev].update_req_stats(io_type, event.num_sectors, lat, i2d_lat, d2c_lat) -+ if event.completion_us > last_dis_time and (event.completion_us - last_dis_time) > dis_interval * 1000 * 1000: -+ time_interval = event.completion_us - last_dis_time; -+ localtime = time.asctime( time.localtime(time.time())) -+ -+ keys = io_stats.keys() -+ keys.sort() -+ print("%s" % localtime) -+ for k in keys: -+ io_stats[k].display_req_stats(time_interval) -+ last_dis_time = event.completion_us -+ -+ print("\n") -+ io_stats.clear() -+ -+ -+b["req_stat"].open_perf_buffer(print_event, page_cnt=64) -+while 1: -+ try: -+ b.perf_buffer_poll() -+ -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/alibiolatency_example.txt b/tools/alibiolatency_example.txt -new file mode 100644 -index 0000000..70afc6d ---- /dev/null -+++ b/tools/alibiolatency_example.txt -@@ -0,0 +1,119 @@ -+Demonstrations of alibiolatency. -+ -+Alibiolatency can trace every io request's life cycle and do data filter and statistics -+accordingly. With this tool, we can clearly know io request's different stage's latency -+proportion, for example, if io request D2C(see blktrace) time take majority in total io -+request process time, it's most likely disk bottleneck has been reached. -+ -+Usage: -+alibiolatency [-h] [-d DEVICE] [-i [DIS_INTERVAL]] [-t [AVG_THRESHOLD_TIME]] [-T [THRESHOLD_TIME]] [-r] -+ -+Summarize block device I/O latency -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -d DEVICE, --device DEVICE -+ inspect specified device -+ -i [DIS_INTERVAL], --dis_interval [DIS_INTERVAL] -+ specify display interval -+ -t [AVG_THRESHOLD_TIME], --avg_threshold_time [AVG_THRESHOLD_TIME] -+ display only when average request process time is -+ greater than this value -+ -T [THRESHOLD_TIME], --threshold_time [THRESHOLD_TIME] -+ dump request life cycle when single request process -+ time is greater than this value -+ -r, --dump_raw dump every io request life cycle -+ -+examples: -+ ./alibiolatency # summarize block I/O latency(default display interval is 2s) -+ ./alibiolatency -d sda3 # inspect specified device /dev/sda3 -+ ./alibiolatency -i 2 # specify display interval, 2s -+ ./alibiolatency -t 10 # display only when average request process time is greater than 10ms -+ ./alibiolatency -T 20 # dump request life cycle when single request process time is greater than 20ms -+ ./alibiolatency -T 20 # dump request life cycle when single request process time is greater than 20ms -+ ./alibiolatency -r # dump every io request life cycle -+ -+ -+Output example: -+ -+Example 1: -+# Summarize block device I/O latency periodically. -+python alibiolatency.py -+ -+Wed Aug 28 19:13:00 2019 -+dev: sda -+ READ iops:32 avg:15.36 min:0.02 max:143.51 speed:0.00MB/s avgrq-sz:4KB -+ [i2d_avg: 0.00 percent:0 min:0.00 max:0.01 -+ [d2c_avg: 15.26 percent:99 min:0.02 max:143.51 -+ WRITE iops:2 avg:0.06 min:0.04 max:0.09 speed:0.00MB/s avgrq-sz:24KB -+ [i2d_avg: 0.00 percent:2 min:0.00 max:0.00 -+ [d2c_avg: 0.05 percent:76 min:0.03 max:0.06 -+ -+ -+Wed Aug 28 19:13:03 2019 -+dev: sda -+ READ iops:62 avg:2.03 min:0.03 max:16.08 speed:0.68MB/s avgrq-sz:14KB -+ [i2d_avg: 0.00 percent:0 min:0.00 max:0.00 -+ [d2c_avg: 1.99 percent:97 min:0.03 max:16.07 -+ WRITE iops:6 avg:0.06 min:0.03 max:0.12 speed:0.00MB/s avgrq-sz:10KB -+ [i2d_avg: 0.00 percent:5 min:0.00 max:0.01 -+ [d2c_avg: 0.04 percent:76 min:0.03 max:0.07 -+ -+Example 2: -+# Summarize block device I/O latency periodically, and will print io request's -+# complete life cycle when this io request's process time is greater than 20ms. -+python alibiolatency.py -T 20 # Summarize block device I/O latency periodically. -+ -+timeouted request sector:269555712 bytes:1024 req time:379 -+sda Q 3102021522 sector:269555712 bytes:1024 act:W -+sda G 3102021522 sector:269555712 bytes:1024 act:W -+sda I 3102021522 sector:269555712 bytes:1024 act:W -+sda D 3102021522 sector:269555712 bytes:1024 act:W -+sda C 3102021902 sector:269555712 bytes:1024 act:W -+Wed Aug 28 19:28:44 2019 -+timeouted request sector:269556736 bytes:1024 req time:381 -+sda Q 3102021522 sector:269556736 bytes:1024 act:W -+sda G 3102021522 sector:269556736 bytes:1024 act:W -+sda I 3102021522 sector:269556736 bytes:1024 act:W -+sda D 3102021522 sector:269556736 bytes:1024 act:W -+sda C 3102021904 sector:269556736 bytes:1024 act:W -+Wed Aug 28 19:28:45 2019 -+dev: sda -+ READ iops:0 avg:0.07 min:0.07 max:0.07 speed:0.00MB/s avgrq-sz:4KB -+ [i2d_avg: 0.01 percent:17 min:0.01 max:0.01 -+ [d2c_avg: 0.05 percent:73 min:0.05 max:0.05 -+ WRITE iops:173 avg:339.79 min:0.04 max:501.72 speed:69.09MB/s avgrq-sz:408KB -+ [i2d_avg: 0.02 percent:0 min:0.00 max:0.06 -+ [d2c_avg: 339.16 percent:99 min:0.04 max:501.67 -+ -+Example 3: -+# Summarize block device I/O latency periodically, and will print every io -+# request's complete life cycle. -+python alibiolatency.py -r -+ -+Wed Aug 28 19:36:40 2019 -+raw request event sector:52903080 bytes:8 req time:0 -+sda Q 3102498371 sector:52903080 bytes:8 act:WS -+sda G 3102498371 sector:52903080 bytes:8 act:WS -+sda I 3102498371 sector:52903080 bytes:8 act:WS -+sda D 3102498371 sector:52903080 bytes:8 act:WS -+sda C 3102498371 sector:52903080 bytes:8 act:WS -+Wed Aug 28 19:36:40 2019 -+raw request event sector:2157826760 bytes:8 req time:0 -+sda Q 3102498785 sector:2157826760 bytes:8 act:W -+sda G 3102498785 sector:2157826760 bytes:8 act:W -+sda I 3102498785 sector:2157826760 bytes:8 act:W -+sda D 3102498785 sector:2157826760 bytes:8 act:W -+sda C 3102498785 sector:2157826760 bytes:8 act:W -+Wed Aug 28 19:36:42 2019 -+raw request event sector:208390056 bytes:8 req time:0 -+sda Q 3102500593 sector:208390056 bytes:8 act:W -+sda G 3102500593 sector:208390056 bytes:8 act:W -+sda I 3102500593 sector:208390056 bytes:8 act:W -+sda D 3102500593 sector:208390056 bytes:8 act:W -+sda C 3102500593 sector:208390056 bytes:8 act:W -+Wed Aug 28 19:36:42 2019 -+dev: sda -+ WRITE iops:42 avg:0.13 min:0.04 max:0.24 speed:0.00MB/s avgrq-sz:5KB -+ [i2d_avg: 0.01 percent:8 min:0.00 max:0.03 -+ [d2c_avg: 0.06 percent:48 min:0.03 max:0.10 -diff --git a/tools/aliexitsnoop.py b/tools/aliexitsnoop.py -new file mode 100755 -index 0000000..4f0b1a1 ---- /dev/null -+++ b/tools/aliexitsnoop.py -@@ -0,0 +1,302 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+from __future__ import print_function -+ -+import argparse -+import ctypes as ct -+import os -+import platform -+import re -+import signal -+import sys -+ -+from bcc import BPF -+from bcc.utils import printb -+from datetime import datetime -+from time import strftime -+ -+# -+# aliexitsnoop Trace all process termination (exit, fatal signal) -+# For Linux, uses BCC, eBPF. Embedded C. -+# -+# USAGE: aliexitsnoop [-h] [-x] [-t] [--utc] [--label[=LABEL]] [-p PID] -+# -+_examples = """examples: -+ aliexitsnoop # trace all process termination -+ aliexitsnoop -x # trace only fails, exclude exit(0) -+ aliexitsnoop -t # include timestamps (local time) -+ aliexitsnoop --utc # include timestamps (UTC) -+ aliexitsnoop -p 181 # only trace PID 181 -+ aliexitsnoop --label=exit # label each output line with 'exit' -+""" -+""" -+ Exit status (from ): -+ -+ 0 EX_OK Success -+ 2 argparse error -+ 70 EX_SOFTWARE syntax error detected by compiler, or -+ verifier error from kernel -+ 77 EX_NOPERM Need sudo (CAP_SYS_ADMIN) for BPF() system call -+ -+ More information about this script is in bcc/tools/aliexitsnoop_example.txt -+ -+ Copyright 2016 Netflix, Inc. -+ Copyright 2019 Instana, Inc. -+ Copyright 2019 Alibaba, Inc. -+ Licensed under the Apache License, Version 2.0 (the "License") -+ -+ 07-Feb-2016 Brendan Gregg (Netflix) Created execsnoop -+ 04-May-2019 Arturo Martin-de-Nicolas (Instana) Created exitsnoop -+ 13-May-2019 Jeroen Soeters (Instana) Refactor to import as module -+ 16-Jul-2019 Alvin Zheng (Alibaba) Created aliexitsnoop -+""" -+ -+def _getParser(): -+ parser = argparse.ArgumentParser( -+ description="Trace all process termination (exit, fatal signal)", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=_examples) -+ a = parser.add_argument -+ a("-t", "--timestamp", action="store_true", help="include timestamp (local time default)") -+ a("--utc", action="store_true", help="include timestamp in UTC (-t implied)") -+ a("-p", "--pid", help="trace this PID only") -+ a("--label", help="label each line") -+ a("-x", "--failed", action="store_true", help="trace only fails, exclude exit(0)") -+ # print the embedded C program and exit, for debugging -+ a("--ebpf", action="store_true", help=argparse.SUPPRESS) -+ # RHEL 7.6 keeps task->start_time as struct timespec, convert to u64 nanoseconds -+ a("--timespec", action="store_true", help=argparse.SUPPRESS) -+ return parser.parse_args -+ -+ -+class Global(): -+ parse_args = _getParser() -+ args = None -+ argv = None -+ bpf = None -+ SIGNUM_TO_SIGNAME = dict((v, re.sub("^SIG", "", k)) -+ for k, v in signal.__dict__.items() if re.match("^SIG[A-Z]+$", k)) -+ -+ -+class Data(ct.Structure): -+ """Event data matching struct data_t in _embedded_c().""" -+ _TASK_COMM_LEN = 16 # linux/sched.h -+ _pack_ = 1 -+ _fields_ = [ -+ ("start_time", ct.c_ulonglong), # task->start_time, see --timespec arg -+ ("exit_time", ct.c_ulonglong), # bpf_ktime_get_ns() -+ ("pid_tgid", ct.c_ulonglong), -+ ("pid", ct.c_uint), # task->tgid, thread group id == sys_getpid() -+ ("tid", ct.c_uint), # task->pid, thread id == sys_gettid() -+ ("ppid", ct.c_uint), # task->parent->tgid, notified of exit -+ ("exit_code", ct.c_int), -+ ("sig_info", ct.c_uint), -+ ("k_stack_id", ct.c_int), -+ ("u_stack_id", ct.c_int), -+ ("task", ct.c_char * _TASK_COMM_LEN) -+ ] -+ -+def _embedded_c(args): -+ """Generate C program for sched_process_exit tracepoint in kernel/exit.c.""" -+ c = """ -+ EBPF_COMMENT -+ #include -+ BPF_STATIC_ASSERT_DEF -+ BPF_STACK_TRACE(stack_traces,16384); -+ struct data_t { -+ u64 start_time; -+ u64 exit_time; -+ u64 pid_tgid; -+ u32 pid; -+ u32 tid; -+ u32 ppid; -+ int exit_code; -+ u32 sig_info; -+ int k_stack_id; -+ int u_stack_id; -+ char task[TASK_COMM_LEN]; -+ } __attribute__((packed)); -+ -+ BPF_STATIC_ASSERT(sizeof(struct data_t) == CTYPES_SIZEOF_DATA); -+ BPF_PERF_OUTPUT(events); -+ -+ int kprobe_do_exit(struct pt_regs * ctx, long exit_code) -+ { -+ struct task_struct *task = (typeof(task))bpf_get_current_task(); -+ if (FILTER_PID || FILTER_EXIT_CODE) { return 0; } -+ -+ struct data_t data = { -+ .start_time = PROCESS_START_TIME_NS, -+ .exit_time = bpf_ktime_get_ns(), -+ .pid_tgid=bpf_get_current_pid_tgid(), -+ .pid = task->tgid, -+ .tid = task->pid, -+ .ppid = task->parent->tgid, -+ .exit_code = task->exit_code >> 8, -+ .sig_info = task->exit_code & 0xFF, -+ }; -+ bpf_get_current_comm(&data.task, sizeof(data.task)); -+ data.k_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID); -+ data.u_stack_id=stack_traces.get_stackid(ctx,BPF_F_USER_STACK|BPF_F_REUSE_STACKID); -+ events.perf_submit(ctx, &data, sizeof(data)); -+ return 0; -+ } -+ """ -+ # TODO: this macro belongs in bcc/src/cc/export/helpers.h -+ bpf_static_assert_def = r""" -+ #ifndef BPF_STATIC_ASSERT -+ #define BPF_STATIC_ASSERT(condition) __attribute__((unused)) \ -+ extern int bpf_static_assert[(condition) ? 1 : -1] -+ #endif -+ """ -+ code_substitutions = [ -+ ('EBPF_COMMENT', '' if not Global.args.ebpf else _ebpf_comment()), -+ ("BPF_STATIC_ASSERT_DEF", bpf_static_assert_def), -+ ("CTYPES_SIZEOF_DATA", str(ct.sizeof(Data))), -+ ('FILTER_PID', '0' if not Global.args.pid else "task->tgid != %s" % Global.args.pid), -+ ('FILTER_EXIT_CODE', '0' if not Global.args.failed else 'task->exit_code == 0'), -+ ('PROCESS_START_TIME_NS', 'task->start_time' if not Global.args.timespec else -+ '(task->start_time.tv_sec * 1000000000L) + task->start_time.tv_nsec'), -+ ] -+ for old, new in code_substitutions: -+ c = c.replace(old, new) -+ return c -+ -+def _ebpf_comment(): -+ """Return a C-style comment with information about the generated code.""" -+ comment = ('Created by %s at %s:\n\t%s' % -+ (sys.argv[0], strftime("%Y-%m-%d %H:%M:%S %Z"), _embedded_c.__doc__)) -+ args = str(vars(Global.args)).replace('{', '{\n\t').replace(', ', ',\n\t').replace('}', ',\n }\n\n') -+ return ("\n /*" + ("\n %s\n\n ARGV = %s\n\n ARGS = %s/" % -+ (comment, ' '.join(Global.argv), args)) -+ .replace('\n', '\n\t*').replace('\t', ' ')) -+ -+def _print_header(): -+ if Global.args.timestamp: -+ title = 'TIME-' + ('UTC' if Global.args.utc else strftime("%Z")) -+ print("%-13s" % title, end="") -+ if Global.args.label is not None: -+ print("%-6s" % "LABEL", end="") -+ print("%-16s %-6s %-6s %-6s %-7s %-10s" % -+ ("PCOMM", "PID", "PPID", "TID", "AGE(s)", "EXIT_CODE")) -+ -+def _print_event(cpu, data, size): # callback -+ """Print the exit event.""" -+ stack_traces = Global.bpf['stack_traces'] -+ e = ct.cast(data, ct.POINTER(Data)).contents -+ if Global.args.timestamp: -+ now = datetime.utcnow() if Global.args.utc else datetime.now() -+ print("%-13s" % (now.strftime("%H:%M:%S.%f")[:-3]), end="") -+ if Global.args.label is not None: -+ label = Global.args.label if len(Global.args.label) else 'exit' -+ print("%-6s" % label, end="") -+ age = (e.exit_time - e.start_time) / 1e9 -+ print("%-16s %-6d %-6d %-6d %-7.2f " % -+ (e.task.decode(), e.pid, e.ppid, e.tid, age), end="") -+ if e.sig_info == 0: -+ print("0" if e.exit_code == 0 else "code %d" % e.exit_code) -+ else: -+ sig = e.sig_info & 0x7F -+ if sig: -+ print("signal %d (%s)" % (sig, signum_to_signame(sig)), end="") -+ if e.sig_info & 0x80: -+ print(", core dumped ", end="") -+ print() -+ # print the kernel stack -+ if e.k_stack_id >= 0: -+ cur_stack = reversed(list(stack_traces.walk(e.k_stack_id))) -+ for addr in cur_stack: -+ printb(b"Kernel Stack: %-16x %s" % (addr, Global.bpf.ksym(addr))) -+ else: -+ print("No KERNEL STACK FOUND") -+ # print the user stack -+ if e.u_stack_id >= 0: -+ cur_stack = reversed(list(stack_traces.walk(e.u_stack_id))) -+ for addr in cur_stack: -+ printb(b"USER Stack: %-16x %s" % (addr, Global.bpf.sym(addr, e.pid_tgid))) -+ else: -+ print("No USER STACK FOUND") -+ print("=================================") -+ print() -+# ============================= -+# Module: These functions are available for import -+# ============================= -+def initialize(arg_list=sys.argv[1:]): -+ """Trace all process termination. -+ -+ arg_list - list of args, if omitted then uses command line args -+ arg_list is passed to argparse.ArgumentParser.parse_args() -+ -+ For example, if arg_list = [ '-x', '-t' ] -+ args.failed == True -+ args.timestamp == True -+ -+ Returns a tuple (return_code, result) -+ 0 = Ok, result is the return value from BPF() -+ 1 = args.ebpf is requested, result is the generated C code -+ os.EX_NOPERM: need CAP_SYS_ADMIN, result is error message -+ os.EX_SOFTWARE: internal software error, result is error message -+ """ -+ Global.argv = arg_list -+ Global.args = Global.parse_args(arg_list) -+ if Global.args.utc and not Global.args.timestamp: -+ Global.args.timestamp = True -+ if not Global.args.ebpf and os.geteuid() != 0: -+ return (os.EX_NOPERM, "Need sudo (CAP_SYS_ADMIN) for BPF() system call") -+ if re.match('^3\.10\..*el7.*$', platform.release()): # Centos/Red Hat -+ Global.args.timespec = True -+ for _ in range(2): -+ c = _embedded_c(Global.args) -+ if Global.args.ebpf: -+ return (1, c) -+ try: -+ return (os.EX_OK, BPF(text=c)) -+ except Exception as e: -+ error = format(e) -+ if (not Global.args.timespec and error.find('struct timespec') and error.find('start_time')): -+ print('This kernel keeps task->start_time in a struct timespec.\n' + -+ 'Retrying with --timespec') -+ Global.args.timespec = True -+ continue -+ return (os.EX_SOFTWARE, "BPF error: " + error) -+ except: -+ return (os.EX_SOFTWARE, "Unexpected error: {0}".format(sys.exc_info()[0])) -+ -+def snoop(bpf, event_handler): -+ """Call event_handler for process termination events. -+ -+ bpf - result returned by successful initialize() -+ event_handler - callback function to handle termination event -+ args.pid - Return after event_handler is called, only monitoring this pid -+ """ -+ bpf["events"].open_perf_buffer(event_handler) -+ while True: -+ bpf.perf_buffer_poll() -+ if Global.args.pid: -+ return -+ -+def signum_to_signame(signum): -+ """Return the name of the signal corresponding to signum.""" -+ return Global.SIGNUM_TO_SIGNAME.get(signum, "unknown") -+ -+# ============================= -+# Script: invoked as a script -+# ============================= -+def main(): -+ try: -+ rc, buffer = initialize() -+ Global.bpf = buffer -+ Global.bpf.attach_kprobe(event="do_exit", fn_name="kprobe_do_exit") -+ if rc: -+ print(buffer) -+ sys.exit(0 if Global.args.ebpf else rc) -+ _print_header() -+ snoop(buffer, _print_event) -+ except KeyboardInterrupt: -+ print() -+ sys.exit() -+ -+ return 0 -+ -+if __name__ == '__main__': -+ main() -diff --git a/tools/aliexitsnoop_example.txt b/tools/aliexitsnoop_example.txt -new file mode 100644 -index 0000000..4b60929 ---- /dev/null -+++ b/tools/aliexitsnoop_example.txt -@@ -0,0 +1,78 @@ -+Demonstrations of aliexitsnoop. -+ -+This Linux tool is actually an enhanced version of the exitsnoop. For more details about -+exitsnoop, see exitsnoop_example.txt. Compared to the exitsnoop, aliexitsnoop not only print -+the exit code, pid, ppid, lifetime of the exiting processes, but also the stack of the exiting -+processes. The demand stems from the fact that in the production environment, some processes -+end up unexpectedly. Therefore we need to dump the stack of the target processes to find out -+what caused the target process to exit. -+ -+Usage: -+aliexitsnoop.py [-h] [-t] [--utc] [-p PID] [--label LABEL] [-x] -+ -+Trace all process termination (exit, fatal signal) -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -t, --timestamp include timestamp (local time default) -+ --utc include timestamp in UTC (-t implied) -+ -p PID, --pid PID trace this PID only -+ --label LABEL label each line -+ -x, --failed trace only fails, exclude exit(0) -+ -+examples: -+ aliexitsnoop # trace all process termination -+ aliexitsnoop -x # trace only fails, exclude exit(0) -+ aliexitsnoop -t # include timestamps (local time) -+ aliexitsnoop --utc # include timestamps (UTC) -+ aliexitsnoop -p 181 # only trace PID 181 -+ aliexitsnoop --label=exit # label each output line with 'exit' -+ -+The following example shows the output example when run in a loop that is interrupted with -+Ctrl-C from the terminal: -+ -+grep 25099 25087 25099 0.00 0 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group -+Kernel Stack: ffffffffa5083ad9 do_group_exit -+Kernel Stack: ffffffffa5082e11 do_exit -+USER Stack: c74864108b48 [unknown] -+USER Stack: 7fd29b10c660 [unknown] -+USER Stack: 7fd29b0611d9 [unknown] -+================================= -+ -+id 25101 25100 25101 0.00 0 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group -+Kernel Stack: ffffffffa5083ad9 do_group_exit -+Kernel Stack: ffffffffa5082e11 do_exit -+USER Stack: c74864108b48 [unknown] -+USER Stack: 7f932c07b660 [unknown] -+USER Stack: 7f932bfd01d9 [unknown] -+================================= -+ -+bash 25100 25087 25100 0.00 0 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group -+Kernel Stack: ffffffffa5083ad9 do_group_exit -+Kernel Stack: ffffffffa5082e11 do_exit -+USER Stack: c74864108b48 [unknown] -+USER Stack: 7f5318862660 [unknown] -+USER Stack: 7f53187b71d9 [unknown] -+================================= -+ -+systemd 1 0 25102 0.00 0 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa5083a97 __x64_sys_exit -+Kernel Stack: ffffffffa5082e11 do_exit -+USER Stack: 7f92232c1e64 [unknown] -+ -+We can see that aliexitsnoop prints the user stack and kernel stack of the exiting processes. -+What needs to be noted is that aliexitsnoop can only get the address track of the user stack. -+The reason why aliexitsnoop cannot get the symbols corresponding the addresses is that the -+user space of the process does not exists after the process ends. However, the address track -+of the user stack still helps, because you can analyze together with /proc/$pid/maps. -diff --git a/tools/aliext4writeslower.py b/tools/aliext4writeslower.py -new file mode 100755 -index 0000000..db626c7 ---- /dev/null -+++ b/tools/aliext4writeslower.py -@@ -0,0 +1,411 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# aliext4writeslower Trace slow ext4 write operation. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: aliext4writeslower [-h] [-p PID] [-d device] [min_ms] -+# -+# This script traces key steps of write(2) which maybe time-consuming, -+# here trace 4 key steps: jbd2 handle start(it may need jbd2 trancation -+# checkpoint or commit), page cache allocation, block allocation and -+# balance dirty pages, and we also take into account sched delay. -+# -+# With this tool, we can clearly know write(2)'s latency proportion. -+# -+# WARNING: This adds low-overhead instrumentation to these ext4 operations, -+# including reads and writes from the file system cache. Such reads and writes -+# can be very frequent (depending on the workload; eg, 1M/sec), at which -+# point the overhead of this tool (even if it prints no "slower" events) can -+# begin to become significant. -+# -+# By default, a minimum millisecond threshold of 10 is used. -+# -+# Licensed under the Apache License, Version 2.0 (the "License") -+# -+# 2019-08-29 Xiaoguang Wang Created this. -+ -+from __future__ import print_function -+from bcc import BPF -+import argparse -+from time import strftime -+import ctypes as ct -+ -+# symbols -+kallsyms = "/proc/kallsyms" -+ -+# arguments -+examples = """examples: -+ ./aliext4writeslower # trace write(2) slower than 10 ms (default) -+ ./aliext4writeslower 1 # trace write(2) slower than 1 ms -+ ./aliext4writeslower 0 # trace all operations (warning: verbose) -+ ./aliext4writeslower -d sda5 # inspect specified device /dev/sda5 containing ext4 fs -+ ./aliext4writeslower -p 185 # trace PID 185 only -+""" -+parser = argparse.ArgumentParser( -+ description="Trace common ext4 write(2) slower than a threshold", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-p", "--pid", help="trace this PID only") -+parser.add_argument("-d", "--device", help="inspect specified device") -+parser.add_argument("min_ms", nargs="?", default='10', -+ help="minimum I/O duration to trace, in ms (default 10)") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+args = parser.parse_args() -+min_ms = int(args.min_ms) -+pid = args.pid -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+#include -+#include -+ -+enum { -+ TRACE_PAGECACHE_ALLOC = 0, -+ TRACE_JOURNAL_START, -+ TRACE_BLOCK_ALLOC, -+ TRACE_BALANCE_DIRTY, -+ TRACE_WRITE_BEGIN, -+ TRACE_WRITE_END, -+ TRACE_TYPE_MAX, -+}; -+ -+struct val_t { -+ dev_t dev; -+ u64 ts; -+ u64 sched_delay; -+ u64 trace_ts[TRACE_TYPE_MAX]; -+ u64 trace_time[TRACE_TYPE_MAX]; -+ u64 offset; -+ struct file *fp; -+}; -+ -+struct data_t { -+ dev_t dev; -+ u64 size; -+ u64 offset; -+ u64 delta_us; -+ u64 pagecache_alloc_us; -+ u64 journal_start_us; -+ u64 block_alloc_us; -+ u64 balance_dirty_us; -+ u64 write_begin_us; -+ u64 write_end_us; -+ u64 sched_delay; -+ u64 pid; -+ char task[TASK_COMM_LEN]; -+ char file[DNAME_INLINE_LEN]; -+}; -+ -+BPF_HASH(entryinfo, u64, struct val_t); -+BPF_PERF_OUTPUT(events); -+ -+ -+// ext4_file_write_iter(): -+int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb) -+{ -+ struct val_t val = {}; -+ u64 id = bpf_get_current_pid_tgid(); -+ u32 pid = id >> 32; // PID is higher part -+ struct task_struct *p = (struct task_struct *)bpf_get_current_task(); -+ struct inode *inode = iocb->ki_filp->f_inode; -+ dev_t dev = inode->i_sb->s_bdev->bd_dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ if (FILTER_PID) -+ return 0; -+ -+ val.ts = bpf_ktime_get_ns(); -+ val.sched_delay = p->sched_info.run_delay; -+ val.fp = iocb->ki_filp; -+ val.dev = dev; -+ val.offset = iocb->ki_pos; -+ if (val.fp) -+ entryinfo.update(&id, &val); -+ -+ return 0; -+} -+ -+int trace_write_return(struct pt_regs *ctx) -+{ -+ struct val_t *valp; -+ u64 ts, delta_us, id = bpf_get_current_pid_tgid(); -+ u32 size, pid = id >> 32; -+ struct task_struct *p = (struct task_struct *)bpf_get_current_task(); -+ struct data_t data = {}; -+ -+ if (FILTER_PID) -+ return 0; -+ -+ valp = entryinfo.lookup(&id); -+ if (valp == 0) -+ return 0; -+ -+ ts = bpf_ktime_get_ns(); -+ delta_us = (ts - valp->ts) / 1000; -+ entryinfo.delete(&id); -+ if (FILTER_US) -+ return 0; -+ -+ size = PT_REGS_RC(ctx); -+ data.size = size; -+ data.delta_us = delta_us; -+ data.pid = pid; -+ data.sched_delay = (p->sched_info.run_delay - valp->sched_delay) / 1000; -+ data.offset = valp->offset; -+ data.dev = valp->dev; -+ data.pagecache_alloc_us = valp->trace_time[TRACE_PAGECACHE_ALLOC] / 1000; -+ data.journal_start_us = valp->trace_time[TRACE_JOURNAL_START] / 1000; -+ data.block_alloc_us = valp->trace_time[TRACE_BLOCK_ALLOC] / 1000; -+ data.balance_dirty_us = valp->trace_time[TRACE_BALANCE_DIRTY] / 1000; -+ data.write_begin_us = valp->trace_time[TRACE_WRITE_BEGIN] / 1000; -+ data.write_end_us = valp->trace_time[TRACE_WRITE_END] / 1000; -+ bpf_get_current_comm(&data.task, sizeof(data.task)); -+ -+ // workaround (rewriter should handle file to d_name in one step): -+ struct dentry *de = NULL; -+ struct qstr qs = {}; -+ bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry); -+ bpf_probe_read(&qs, sizeof(qs), (void *)&de->d_name); -+ if (qs.len == 0) -+ return 0; -+ bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name); -+ -+ // output -+ events.perf_submit(ctx, &data, sizeof(data)); -+ -+ return 0; -+} -+ -+ -+static int trace_type_entry(struct pt_regs *ctx, int type) -+{ -+ struct val_t *valp; -+ u64 id = bpf_get_current_pid_tgid(); -+ u32 pid = id >> 32; // PID is higher part -+ u64 ts = bpf_ktime_get_ns(); -+ -+ valp = entryinfo.lookup(&id); -+ if (valp == 0) { -+ // missed tracing issue or filtered -+ return 0; -+ } -+ -+ valp->trace_ts[type] = ts; -+ return 0; -+} -+ -+static int trace_type_return(struct pt_regs *ctx, int type) -+{ -+ struct val_t *valp; -+ u64 id = bpf_get_current_pid_tgid(); -+ u32 pid = id >> 32; // PID is higher part -+ u64 ts = bpf_ktime_get_ns(); -+ -+ valp = entryinfo.lookup(&id); -+ if (valp == 0) { -+ // missed tracing issue or filtered -+ return 0; -+ } -+ -+ valp->trace_time[type] += ts - valp->trace_ts[type];; -+ return 0; -+} -+ -+int trace_pagecache_alloc_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_PAGECACHE_ALLOC); -+} -+ -+int trace_pagecache_alloc_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_PAGECACHE_ALLOC); -+} -+ -+int trace_journal_start_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_JOURNAL_START); -+} -+ -+int trace_journal_start_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_JOURNAL_START); -+} -+ -+int trace_block_alloc_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_BLOCK_ALLOC); -+} -+ -+int trace_block_alloc_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_BLOCK_ALLOC); -+} -+ -+int trace_balance_dirty_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_BALANCE_DIRTY); -+} -+ -+int trace_balance_dirty_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_BALANCE_DIRTY); -+} -+ -+int trace_aops_write_begin_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_WRITE_BEGIN); -+} -+ -+int trace_aops_write_begin_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_WRITE_BEGIN); -+} -+ -+int trace_aops_write_end_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_WRITE_END); -+} -+ -+int trace_aops_write_end_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_WRITE_END); -+} -+""" -+ -+devid2name={} -+num_trans=0 -+ -+def init_dev_name(): -+ global devid2name -+ -+ f = open("/proc/partitions") -+ for line in f.readlines(): -+ line = line.strip() -+ if not len(line): -+ continue; -+ -+ str = line.split() -+ if str[0].isalpha(): -+ continue -+ dev_num = (int(str[0]) << 20) + int(str[1]) -+ devid2name[dev_num] = str[3] -+ f.close() -+ -+init_dev_name() -+ -+name2devid={v:k for k,v in devid2name.items()} -+ -+if args.device: -+ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) -+else: -+ bpf_text = bpf_text.replace('FILTER_DEV', '0') -+ -+if min_ms == 0: -+ bpf_text = bpf_text.replace('FILTER_US', '0') -+else: -+ bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_ms * 1000)) -+ -+if args.pid: -+ bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) -+else: -+ bpf_text = bpf_text.replace('FILTER_PID', '0') -+ -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+# kernel->user event data: struct data_t -+DNAME_INLINE_LEN = 32 # linux/dcache.h -+TASK_COMM_LEN = 16 # linux/sched.h -+class Data(ct.Structure): -+ _fields_ = [ -+ ("dev", ct.c_uint), -+ ("size", ct.c_ulonglong), -+ ("offset", ct.c_ulonglong), -+ ("delta_us", ct.c_ulonglong), -+ ("pagecache_alloc_us", ct.c_ulonglong), -+ ("journal_start_us", ct.c_ulonglong), -+ ("block_alloc_us", ct.c_ulonglong), -+ ("balance_dirty_us", ct.c_ulonglong), -+ ("write_begin_us", ct.c_ulonglong), -+ ("write_end_us", ct.c_ulonglong), -+ ("sched_delay", ct.c_ulonglong), -+ ("pid", ct.c_ulonglong), -+ ("task", ct.c_char * TASK_COMM_LEN), -+ ("file", ct.c_char * DNAME_INLINE_LEN) -+ ] -+ -+# process event -+def print_event(cpu, data, size): -+ event = ct.cast(data, ct.POINTER(Data)).contents -+ -+ print("%-8s %-8s %-14.14s %-6s %-7s %-8d %7.2f %8.2f %10.2f %11.2f %13.2f %11.2f %9.2f %11.2f %s" % -+ (strftime("%H:%M:%S"), devid2name[event.dev], -+ event.task.decode(), event.pid, event.size, event.offset / 1024, -+ float(event.delta_us) / 1000, float(event.pagecache_alloc_us) / 1000, -+ float(event.journal_start_us) / 1000, float(event.block_alloc_us) / 1000, -+ float(event.balance_dirty_us) / 1000, float(event.write_begin_us) / 1000, -+ float(event.write_end_us) / 1000, float(event.sched_delay) / 1000, -+ event.file.decode())) -+ -+# initialize BPF -+b = BPF(text=bpf_text) -+ -+b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_write_entry") -+b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") -+ -+# trace address_space_operations .write_begin() -+b.attach_kprobe(event="ext4_da_write_begin", fn_name="trace_aops_write_begin_entry") -+b.attach_kprobe(event="ext4_write_begin", fn_name="trace_aops_write_begin_entry") -+b.attach_kretprobe(event="ext4_da_write_begin", fn_name="trace_aops_write_begin_return") -+b.attach_kretprobe(event="ext4_write_begin", fn_name="trace_aops_write_begin_return") -+ -+# trace address_space_operations .write_end() -+b.attach_kprobe(event="ext4_da_write_end", fn_name="trace_aops_write_end_entry") -+b.attach_kprobe(event="ext4_write_end", fn_name="trace_aops_write_end_entry") -+b.attach_kretprobe(event="ext4_da_write_end", fn_name="trace_aops_write_end_return") -+b.attach_kretprobe(event="ext4_write_end", fn_name="trace_aops_write_end_return") -+ -+# trace page cache allocation -+b.attach_kprobe(event="grab_cache_page_write_begin", fn_name="trace_pagecache_alloc_entry") -+b.attach_kretprobe(event="grab_cache_page_write_begin", fn_name="trace_pagecache_alloc_return") -+ -+# trace journal start -+b.attach_kprobe(event="jbd2__journal_start", fn_name="trace_journal_start_entry") -+b.attach_kretprobe(event="jbd2__journal_start", fn_name="trace_journal_start_return") -+ -+# trace block allocation -+b.attach_kprobe(event="ext4_da_get_block_prep", fn_name="trace_block_alloc_entry") -+b.attach_kretprobe(event="ext4_da_get_block_prep", fn_name="trace_block_alloc_return") -+b.attach_kprobe(event="_ext4_get_block", fn_name="trace_block_alloc_entry") -+b.attach_kretprobe(event="_ext4_get_block", fn_name="trace_block_alloc_return") -+ -+# trace balance dirty pages -+b.attach_kprobe(event="balance_dirty_pages_ratelimited", fn_name="trace_balance_dirty_entry") -+b.attach_kretprobe(event="balance_dirty_pages_ratelimited", fn_name="trace_balance_dirty_return") -+ -+# header -+if min_ms == 0: -+ print("Tracing ext4 operations") -+else: -+ print("Tracing ext4 operations slower than %d ms" % min_ms) -+ -+print("%-8s %-8s %-14s %-6s %-7s %-8s %7s %8s %10s %11s %13s %11s %9s %11s %s" % ("TIME", -+ "DEVICE", "COMM", "PID", "BYTES", "OFF_KB", "LAT(ms)", "pg_alloc", "jbd2_start", -+ "block_alloc", "balance_dirty", "write_begin", "write_end", "sched_delay", "FILENAME")) -+ -+# read events -+b["events"].open_perf_buffer(print_event, page_cnt=64) -+while 1: -+ try: -+ b.perf_buffer_poll() -+ -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/aliext4writeslower_example.txt b/tools/aliext4writeslower_example.txt -new file mode 100644 -index 0000000..b702d9d ---- /dev/null -+++ b/tools/aliext4writeslower_example.txt -@@ -0,0 +1,68 @@ -+Demonstrations of aliext4writeslower. -+ -+This script traces key steps of write(2) which maybe time-consuming, -+here trace 4 key steps: jbd2 handle start(it may need jbd2 transaction -+checkpoint or commit), page cache allocation, block allocation and -+balance dirty pages, and we also take into account sched delay. -+With this tool, we can clearly know write(2)'s latency proportion. -+ -+Usage: -+usage: aliext4writeslower [-h] [-p PID] [-d DEVICE] [min_ms] -+ -+Trace common ext4 write(2) slower than a threshold -+ -+positional arguments: -+ min_ms minimum I/O duration to trace, in ms (default 10) -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -p PID, --pid PID trace this PID only -+ -d DEVICE, --device DEVICE -+ inspect specified device -+ -+examples: -+ ./aliext4writeslower # trace write(2) slower than 10 ms (default) -+ ./aliext4writeslower 1 # trace write(2) slower than 1 ms -+ ./aliext4writeslower 0 # trace all operations (warning: verbose) -+ ./aliext4writeslower -d sda5 # inspect specified device /dev/sda5 containing ext4 fs -+ ./aliext4writeslower -p 185 # trace PID 185 only -+ -+ -+Output example: -+ -+Example 1: -+# Trace write(2) slower than 5 ms -+python aliext4writeslower.py 5 -+ -+Tracing ext4 operations slower than 5 ms -+TIME DEVICE COMM PID BYTES OFF_KB LAT(ms) pg_alloc jbd2_start block_alloc balance_dirty write_begin write_end sched_delay FILENAME -+17:14:07 sda5 dd 31149 1048576 473088 5.15 1.02 0.68 0.25 0.54 2.50 1.57 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 474112 5.27 1.07 0.68 0.27 0.54 2.58 1.59 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 475136 5.09 1.01 0.69 0.30 0.53 2.52 1.49 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 607232 5.12 1.09 0.67 0.21 0.54 2.52 1.50 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 637952 6.88 1.20 1.15 0.34 0.75 3.38 1.97 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 674816 5.29 1.17 0.68 0.20 0.55 2.60 1.58 0.00 testfile -+17:14:08 sda5 dd 31149 1048576 738304 5.08 0.99 0.69 0.27 0.54 2.46 1.54 0.00 testfile -+17:14:08 sda5 dd 31149 1048576 790528 5.03 0.94 0.69 0.25 0.54 2.42 1.50 0.00 testfile -+17:14:08 sda5 dd 31149 1048576 803840 5.04 0.96 0.67 0.29 0.54 2.45 1.51 0.00 testfile -+17:14:08 sda5 dd 31149 1048576 870400 5.15 1.10 0.68 0.19 0.55 2.52 1.53 0.00 testfile -+17:14:09 sda5 dd 31149 1048576 986112 5.08 1.00 0.69 0.27 0.54 2.48 1.51 0.00 testfile -+ -+ -+Example 2: -+# Inspect specified device /dev/sda5 containing ext4 fs -+python aliext4writeslower.py -d sda5 -+ -+Tracing ext4 operations -+TIME DEVICE COMM PID BYTES OFF_KB LAT(ms) pg_alloc jbd2_start block_alloc balance_dirty write_begin write_end sched_delay FILENAME -+17:16:24 sda3 su 32894 292 0 0.03 0.00 0.00 0.00 0.00 0.01 0.00 0.00 lastlog -+17:16:24 sda3 syslog-ng 990 67 456973 0.07 0.01 0.01 0.00 0.00 0.02 0.01 0.00 secure-20190805 -+17:16:24 sda3 syslog-ng 990 106 456973 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 secure-20190805 -+17:16:24 sda3 syslog-ng 990 67 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 -+17:16:24 sda3 syslog-ng 990 106 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 -+17:16:24 sda3 ilogtail 56674 199 3296 0.04 0.00 0.01 0.00 0.00 0.01 0.01 0.00 000147.log -+17:16:24 sda3 syslog-ng 990 95 456974 0.04 0.01 0.01 0.00 0.00 0.01 0.01 0.00 secure-20190805 -+17:16:24 sda3 syslog-ng 990 95 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 -+17:16:24 sda3 xperf 2558 874 403 0.03 0.00 0.01 0.00 0.00 0.01 0.01 0.00 xperf.pmu.data.0 -+17:16:25 sda3 xperf 2558 819 403 0.06 0.01 0.01 0.00 0.01 0.02 0.01 0.00 xperf.pmu.data.0 -+17:16:25 sda3 syslog-ng 990 175 456974 0.05 0.01 0.01 0.00 0.00 0.01 0.01 0.00 secure-20190805 -diff --git a/tools/alihardirqs.py b/tools/alihardirqs.py -new file mode 100755 -index 0000000..b8b0500 ---- /dev/null -+++ b/tools/alihardirqs.py -@@ -0,0 +1,219 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alihardirqs Summarize hard IRQ (interrupt) event time. -+# Enhanced version of BCC hardirqs. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alihardirqs [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] [interval] [outputs] -+# -+# Copyright (c) 2015 Brendan Gregg. -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+ -+# arguments -+examples = """examples: -+ ./alihardirqs # sum hard irq event time -+ ./alihardirqs -D # show hard irq event time as histograms -+ ./alihardirqs -c 0 # show hard irq event time of CPU 0 only -+ ./alihardirqs -i 25 # show event time of IRQ 25 only -+ ./alihardirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only -+ ./alihardirqs 1 10 # print 1 second summaries, 10 times -+ ./alihardirqs -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize hard irq statistics, including total count, time, etc.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+parser.add_argument("-D", "--dist", action="store_true", -+ help="show distributions as histograms") -+parser.add_argument("-c", "--cpu", -+ help="output hardIRQ statistics on specific CPU only") -+parser.add_argument("-i", "--irq", -+ help="output statistics of specific hardIRQ") -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+distmode = "1" if args.dist else "0" -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct irq_key { -+ u32 cpu; -+ u32 irq; -+ char name[32]; -+} irq_key_t; -+ -+typedef struct irq_key_slot { -+ irq_key_t key; -+ u64 slot; -+} irq_key_slot_t; -+ -+typedef struct irq_val { -+ u64 count; -+ u64 time; -+ u64 time_max; -+} irq_val_t; -+ -+typedef struct stamp { -+ u64 ts; -+ char name[32]; -+} stamp_t; -+ -+BPF_PERCPU_ARRAY(start, stamp_t, 1); -+BPF_HASH(res, irq_key_t, irq_val_t); -+BPF_HISTOGRAM(dist, irq_key_slot_t); -+ -+ -+TRACEPOINT_PROBE(irq, irq_handler_entry) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ stamp_t stamp = { -+ .ts = bpf_ktime_get_ns(), -+ }; -+ TP_DATA_LOC_READ_CONST(stamp.name, name, sizeof(stamp.name)); -+ start.update(&idx, &stamp); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(irq, irq_handler_exit) -+{ -+ u64 delta; -+ u32 cpu = bpf_get_smp_processor_id(); -+ -+ u32 idx = 0; -+ // fetch timestamp and calculate delta -+ stamp_t *stampp = start.lookup(&idx); -+ if (!stampp || stampp->ts == 0) { -+ return 0; // missed start -+ } -+ -+ delta = bpf_ktime_get_ns() - stampp->ts; -+ irq_key_t key = {.cpu = cpu, .irq = args->irq}; -+ char *name = (char *)stampp->name; -+ bpf_probe_read(key.name, sizeof(key.name), name); -+ -+#if DIST_MODE -+ irq_key_slot_t key_slot = { -+ .key = key, -+ .slot = bpf_log2l(delta / FACTOR), -+ }; -+ dist.increment(key_slot); -+#else -+ struct irq_val *valp, val; -+ valp = res.lookup(&key); -+ -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ if (valp->time_max < delta) {valp->time_max = delta;} -+ } -+ else { -+ val.count = 1; -+ val.time = val.time_max = delta; -+ res.update(&key, &val); -+ } -+#endif -+ stamp_t zero = {}; -+ start.update(&idx, &zero); -+ -+ return 0; -+} -+""" -+ -+cond_filter = '' -+cpu_filter = "1" -+if args.cpu: -+ cpu_filter = "bpf_get_smp_processor_id() == %s" % args.cpu -+ -+irq_filter = "1" -+if args.irq: -+ irq_filter = "args->irq == %s" % args.irq -+ -+cond_filter = "if (!(%s && %s)) {return 0;}" % (cpu_filter, irq_filter) -+bpf_text = bpf_text.replace('COND_FILTER', cond_filter) -+ -+bpf_text = bpf_text.replace('FACTOR', "%d" % factor) -+bpf_text = bpf_text.replace('DIST_MODE', distmode) -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+def print_section(key): -+ return "CPU %d, hardIRQ %d (%s)" % (key[0], key[1], key[2].decode('utf-8', 'replace')) -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing hard irq event time... Hit Ctrl-C to end.") -+ -+if args.dist: -+ tab = b.get_table("dist") -+else: -+ tab = b.get_table("res") -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ countdown = 1 -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ if args.dist: -+ tab.print_log2_hist("time_" + label, "section", -+ section_print_fn=print_section, -+ bucket_fn=lambda k: (k.cpu, k.irq, k.name), -+ bucket_sort_fn=lambda keys: sorted(keys)) -+ else: -+ cur_cpu = -1 -+ for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].irq)): -+ if cur_cpu != k.cpu: -+ print("\nCPU%d:" % k.cpu) -+ print("%10s %20s %10s %10s %10s" % -+ ("HARDIRQ", "IRQHandler", "Count", "TOTAL_" + label, "MAX_" + label)) -+ cur_cpu = k.cpu -+ -+ print("%10d %20s %10d %10d %10d" % -+ (k.irq, k.name.decode('utf-8', 'replace'), v.count, v.time / factor, v.time_max / factor)) -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alihardirqs_example.txt b/tools/alihardirqs_example.txt -new file mode 100644 -index 0000000..caebb47 ---- /dev/null -+++ b/tools/alihardirqs_example.txt -@@ -0,0 +1,93 @@ -+Demonstrations of alihardirqs, the enhanced version of BCC hardirqs. -+ -+This program traces hard interrupts (irqs), and stores timing statistics -+in-kernel for efficiency. -+ -+ -+Example 1 -+ -+By default, all statistics of all hardirq on all CPUs are sampled, -+and the output is displayed in a per-hardirq and per-CPU flavor. -+ -+# alihardirqs -+Tracing hard irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU3: -+ HARDIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 31 virtio0-input.0 2 4 2 -+ -+ -+Example 2 -+ -+You can sample statistics on specified CPU only with "-c" option, -+and the output will be displayed in a per-hardirq flavor. -+ -+# alihardirqs -c 3 -+Tracing hard irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU3: -+ HARDIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 31 virtio0-input.0 3 10 7 -+ -+ -+Exampel 3 -+ -+You can sample statistics of specified hardirq only with "-i" option, -+and the output will be displayed in a per-CPU flavor. -+ -+# alihardirqs -i 31 -+Tracing hard irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU3: -+ HARDIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 31 virtio0-input.0 3 8 6 -+ -+ -+Example 4 -+ -+The distribution of hardirq run time can be printed as a histogram with "-D" -+option. The output will be displayed in a per-hardirq and per-CPU flavor. -+ -+# alihardirqs -D -+Tracing hard irq event time... Hit Ctrl-C to end. -+^C -+ -+section = CPU 3, hardIRQ 31 (virtio0-input.0) -+ time_usecs : count distribution -+ 0 -> 1 : 1 |****************************************| -+ 2 -> 3 : 0 | | -+ 4 -> 7 : 1 |****************************************| -+ -+ -+ -+USAGE message: -+ -+# alihardirqs -h -+usage: alihardirqs.py [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] -+ [interval] [outputs] -+ -+Summarize hard irq statistics, including total count, time, etc. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -D, --dist show distributions as histograms -+ -c CPU, --cpu CPU output hardIRQ statistics on specific CPU only -+ -i IRQ, --irq IRQ output statistics of specific hardIRQ -+ -+examples: -+ ./alihardirqs # sum hard irq event time -+ ./alihardirqs -D # show hard irq event time as histograms -+ ./alihardirqs -c 0 # show hard irq event time of CPU 0 only -+ ./alihardirqs -i 25 # show event time of IRQ 25 only -+ ./alihardirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only -+ ./alihardirqs 1 10 # print 1 second summaries, 10 times -+ ./alihardirqs -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/alijbd2stat.py b/tools/alijbd2stat.py -new file mode 100755 -index 0000000..f727fc1 ---- /dev/null -+++ b/tools/alijbd2stat.py -@@ -0,0 +1,392 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alijbd2stat Summarize filesystem jbd2 stat. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alijbd2stat [-h] [-t thresh] [-d device] -+# -+# Copyright (c) 2019-2021 Alibaba Group. -+# Licensed under the Apache License, Version 2.0 (the "License") -+# -+# 2019/08/05 Xiaoguang Wang Created this. -+ -+from __future__ import print_function -+from bcc import BPF -+import ctypes as ct -+import argparse -+import time -+ -+# arguments -+examples = """examples: -+ ./alijbd2stat # summarize filesystem jbd2 stat -+ ./alijbd2stat -d sda3 # inspect specified device /dev/sda3 -+ ./alijbd2stat -t 10 # show jbd2 handle's context when it runs more than 10ms -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize filesystem jbd2 stat", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-d", "--device", help="inspect specified device") -+parser.add_argument("-t", "--thresh_time", nargs="?", default=0, -+ help="show jbd2 handle's context when its run time is greater than this value") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+args = parser.parse_args() -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+#include -+#include -+#include -+ -+struct handle_stat { -+ dev_t dev; -+ u32 type; -+ u32 line_no; -+ unsigned long tid; -+ unsigned long pre_start_time; -+ unsigned long start_time; -+ unsigned long end_time; -+ u64 sched_delay; -+ u64 kernel_ip; -+ int kernel_stack_id; -+ u32 pid; -+ char comm[TASK_COMM_LEN]; -+}; -+ -+struct transaction_run_stat { -+ dev_t dev; -+ u32 handle_count; -+ u32 blocks; -+ u32 blocks_logged; -+ unsigned long tid; -+ unsigned long wait; -+ unsigned long request_delay; -+ unsigned long running; -+ unsigned long locked; -+ unsigned long flushing; -+ unsigned long logging; -+}; -+ -+/* -+struct handle_info_per_type { -+ u32 count -+}; -+*/ -+ -+BPF_HASH(handle_stat_map, struct task_struct *, struct handle_stat); -+BPF_STACK_TRACE(stack_traces, 1024); -+BPF_PERF_OUTPUT(jbd2_handle_stat); -+BPF_PERF_OUTPUT(transaction_stat); -+ -+ -+int trace_start_this_handle(struct pt_regs *ctx, journal_t *journal, -+ handle_t *handle, gfp_t gfp_mask) -+{ -+ struct handle_stat s; -+ struct task_struct *t = (struct task_struct *)bpf_get_current_task(); -+ dev_t dev = journal->j_fs_dev->bd_dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ memset(&s, 0, sizeof(struct handle_stat)); -+ s.dev = dev; -+ s.pre_start_time = bpf_ktime_get_ns(); -+ s.sched_delay = t->sched_info.run_delay; -+ -+ handle_stat_map.update(&t, &s); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(jbd2, jbd2_handle_start) -+{ -+ struct handle_stat *s; -+ struct task_struct *t = (struct task_struct *)bpf_get_current_task(); -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ s = handle_stat_map.lookup(&t); -+ if (s == NULL) -+ return 0; -+ -+ s->start_time = bpf_ktime_get_ns(); -+ s->type = args->type; -+ s->line_no = args->line_no; -+ s->tid = args->tid; -+ return 0; -+} -+ -+int trace_jbd2_journal_stop(struct pt_regs *ctx, handle_t *handle) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal; -+ struct handle_stat *s; -+ struct handle_stat s2 = {}; -+ struct task_struct *t = (struct task_struct *)bpf_get_current_task(); -+ unsigned long end_time = bpf_ktime_get_ns(); -+ unsigned long run_time; -+ -+ if (transaction == NULL) { -+ handle_stat_map.delete(&t); -+ return 0; -+ } -+ -+ journal = transaction->t_journal; -+ dev_t dev = journal->j_fs_dev->bd_dev; -+ if (FILTER_DEV) -+ return 0; -+ -+ if (handle->h_ref >= 2) -+ return 0; -+ -+ s = handle_stat_map.lookup(&t); -+ if (s == NULL) -+ return 0; -+ -+ /* -+ * This could happen, jbd2_journal_start_reserved() miss a trace -+ * jbd2_handle_start(). -+ */ -+ if (!s->start_time) -+ s->start_time = s->pre_start_time; -+ -+ -+ run_time = end_time - s->pre_start_time; -+ if (run_time < FILTER_THRESH_TIME * 1000 * 1000LLU) { -+ handle_stat_map.delete(&t); -+ return 0; -+ } -+ -+ s2.dev = s->dev; -+ s2.type = s->type; -+ s2.line_no = s->line_no; -+ s2.tid = s->tid; -+ s2.pre_start_time = s->pre_start_time; -+ s2.start_time = s->start_time; -+ s2.end_time = end_time; -+ s2.pid = bpf_get_current_pid_tgid(); -+ s2.sched_delay = t->sched_info.run_delay - s->sched_delay; -+ bpf_get_current_comm(&s2.comm, sizeof(s2.comm)); -+ -+ s2.kernel_stack_id = stack_traces.get_stackid(ctx, 0); -+ if (s2.kernel_stack_id >= 0) { -+ // populate extras to fix the kernel stack -+ u64 ip = PT_REGS_IP(ctx); -+ u64 page_offset; -+ -+ // if ip isn't sane, leave key ips as zero for later checking -+ #if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE) -+ // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it -+ page_offset = __PAGE_OFFSET_BASE; -+ #elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4) -+ // x64, 4.17, and later -+ #if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL) -+ page_offset = __PAGE_OFFSET_BASE_L5; -+ #else -+ page_offset = __PAGE_OFFSET_BASE_L4; -+ #endif -+ #else -+ // earlier x86_64 kernels, e.g., 4.6, comes here -+ // arm64, s390, powerpc, x86_32 -+ page_offset = PAGE_OFFSET; -+ #endif -+ -+ if (ip > page_offset) -+ s2.kernel_ip = ip; -+ } -+ -+ jbd2_handle_stat.perf_submit(ctx, &s2, sizeof(struct handle_stat)); -+ handle_stat_map.delete(&t); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(jbd2, jbd2_run_stats) -+{ -+ struct transaction_run_stat s; -+ dev_t dev = args->dev;; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ memset(&s, 0, sizeof(struct transaction_run_stat)); -+ s.dev = args->dev; -+ s.tid = args->tid; -+ s.wait = args->wait; -+ s.request_delay = args->request_delay; -+ s.running = args->running; -+ s.locked = args->locked; -+ s.flushing = args->flushing; -+ s.logging = args->logging; -+ s.handle_count = args->handle_count; -+ s.blocks = args->blocks; -+ s.blocks_logged = args->blocks_logged; -+ -+ transaction_stat.perf_submit(args, &s, sizeof(struct transaction_run_stat)); -+ return 0; -+} -+ -+""" -+devid2name={} -+num_trans=0 -+ -+def init_dev_name(): -+ global devid2name -+ -+ f = open("/proc/partitions") -+ for line in f.readlines(): -+ line = line.strip() -+ if not len(line): -+ continue; -+ -+ str = line.split() -+ if str[0].isalpha(): -+ continue -+ dev_num = (int(str[0]) << 20) + int(str[1]) -+ devid2name[dev_num] = str[3] -+ f.close() -+ -+init_dev_name() -+ -+name2devid={v:k for k,v in devid2name.items()} -+ -+if args.device: -+ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) -+else: -+ bpf_text = bpf_text.replace('FILTER_DEV', '0') -+ -+if args.thresh_time: -+ bpf_text = bpf_text.replace('FILTER_THRESH_TIME', '%s' % args.thresh_time) -+else: -+ bpf_text = bpf_text.replace('FILTER_THRESH_TIME', '9999') -+ -+# code substitutions -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+# load BPF program -+b = BPF(text=bpf_text) -+ -+b.attach_kprobe(event="start_this_handle", fn_name="trace_start_this_handle") -+b.attach_kprobe(event="jbd2_journal_stop", fn_name="trace_jbd2_journal_stop") -+ -+print("Tracing jbd2 stats... Hit Ctrl-C to end.") -+ -+TASK_COMM_LEN = 16 -+ -+class HandleStat(ct.Structure): -+ _fields_ = [ -+ ("dev", ct.c_uint), -+ ("type", ct.c_uint), -+ ("line_no", ct.c_uint), -+ ("tid", ct.c_ulonglong), -+ ("pre_start_time", ct.c_ulonglong), -+ ("start_time", ct.c_ulonglong), -+ ("end_time", ct.c_ulonglong), -+ ("sched_delay", ct.c_ulonglong), -+ ("kernel_ip", ct.c_ulonglong), -+ ("kernel_stack_id", ct.c_int), -+ ("pid", ct.c_uint), -+ ("comm", ct.c_char * TASK_COMM_LEN), -+ ] -+ -+class Data(ct.Structure): -+ _fields_ = [ -+ ("dev", ct.c_uint), -+ ("handle_count", ct.c_uint), -+ ("blocks", ct.c_uint), -+ ("blocks_logged", ct.c_uint), -+ ("tid", ct.c_ulonglong), -+ ("wait", ct.c_ulonglong), -+ ("request_delay", ct.c_ulonglong), -+ ("running", ct.c_ulonglong), -+ ("locked", ct.c_ulonglong), -+ ("flushing", ct.c_ulonglong), -+ ("logging", ct.c_ulonglong), -+ ] -+ -+class trans_stat: -+ def __init__(self, dev): -+ self.dev=dev -+ self.handle_count=0 -+ self.blocks=0 -+ self.blocks_logged=0 -+ self.tid=0 -+ slef.wait=0 -+ self.request_delay=0 -+ self.running=0 -+ self.locked=0 -+ self.flushing=0 -+ self.logging=0 -+ -+ -+def stack_id_err(stack_id): -+ # -EFAULT in get_stackid normally means the stack-trace is not available, -+ # Such as getting kernel stack trace in userspace code -+ return (stack_id < 0) and (stack_id != -errno.EFAULT) -+ -+def aksym(addr): -+ return b.ksym(addr) + "_[k]".encode() -+ -+def print_stack(cpu, data, size): -+ k = ct.cast(data, ct.POINTER(HandleStat)).contents -+ -+ kernel_tmp = [] if k.kernel_stack_id < 0 else stack_traces.walk(k.kernel_stack_id) -+ -+ # fix kernel stack -+ kernel_stack = [] -+ if k.kernel_stack_id >= 0: -+ for addr in kernel_tmp: -+ kernel_stack.append(addr) -+ # the later IP checking -+ if k.kernel_ip: -+ kernel_stack.insert(0, k.kernel_ip) -+ -+ # print default multi-line stack output -+ if stack_id_err(k.kernel_stack_id): -+ print(" [Missed Kernel Stack]") -+ else: -+ print("comm: %s pid: %u tid: %lu pre_start_time: %lu run_time: %lu sched_delay: %lu" % -+ (k.comm, k.pid, k.tid, (k.start_time - k.pre_start_time) / 1000, -+ (k.end_time - k.pre_start_time) / 1000, k.sched_delay / 1000 / 1000)) -+ for addr in kernel_stack: -+ print(" %s" % aksym(addr)) -+ #print(" %-16s %s (%d)" % ("-", k.comm.decode('utf-8', 'replace'), k.pid)) -+ #print() -+ -+def print_header(): -+ print("\n%-8s %-12s %-4s %-13s %-7s %-6s %-8s %-7s %-12s %-6s %-13s" % ("dev", -+ "tid", "wait", "request_delay", "running", "locked", "flushing", "logging", -+ "handle_count", "blocks", "blocks_logged")) -+ -+# process event -+def print_event(cpu, data, size): -+ global num_trans -+ -+ event = ct.cast(data, ct.POINTER(Data)).contents -+ if (num_trans % 10) == 0: -+ print_header() -+ -+ print("%-8s %-12lu %-4lu %-13lu %-7lu %-6lu %-8lu %-7lu %-12lu %-6lu %-13lu" % -+ (devid2name[event.dev], event.tid, event.wait, event.request_delay, event.running, -+ event.locked, event.flushing, event.logging, event.handle_count, event.blocks, -+ event.blocks_logged)) -+ num_trans = num_trans + 1 -+ -+label="usecs" -+stack_traces = b["stack_traces"] -+b["jbd2_handle_stat"].open_perf_buffer(print_stack, page_cnt=64) -+b["transaction_stat"].open_perf_buffer(print_event, page_cnt=64) -+while 1: -+ try: -+ b.perf_buffer_poll() -+ -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/alijbd2stat_example.txt b/tools/alijbd2stat_example.txt -new file mode 100644 -index 0000000..a65f12e ---- /dev/null -+++ b/tools/alijbd2stat_example.txt -@@ -0,0 +1,83 @@ -+Demonstrations of alijbd2stat. -+Alijbd2stat can trace every jbd2 handle and every transaction commit -+info. If one jbd2 handle's running time is greater than user specified -+threshold time, alijbd2stat will catch this handle's context. -+ -+Usage: -+alijbd2stat [-h] [-d DEVICE] [-t [THRESH_TIME]] -+ -+Summarize filesystem jbd2 stat -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -d DEVICE, --device DEVICE -+ inspect specified device -+ -t [THRESH_TIME], --thresh_time [THRESH_TIME] -+ show jbd2 handle's context when its run time is -+ greater than this value -+ -+examples: -+ ./alijbd2stat # show jbd2 transaction commit info. -+ ./alijbd2stat -d sda3 # inspect specified device /dev/sda3. -+ ./alijbd2stat -t 10 # show jbd2 handle's context when it runs more than 10ms. -+ -+ -+Output example: -+ -+Example 1: -+# Show jbd2 transaction commit info. -+python alijbd2stat.py -+ -+dev tid wait request_delay running locked flushing logging handle_count blocks blocks_logged -+sda5 11122351 0 0 5052 0 0 0 61 34 13 -+sda3 13641140 0 0 5280 0 0 0 138 19 20 -+sda5 11122352 0 0 5041 0 0 0 59 35 14 -+sda3 13641141 0 0 5000 0 0 0 113 9 10 -+sda5 11122353 0 0 5069 0 0 0 91 47 18 -+ -+Example 2: -+# inspect specified device /dev/sda5 -+python alijbd2stat.py -d sda5 -+ -+dev tid wait request_delay running locked flushing logging handle_count blocks blocks_logged -+sda5 11122461 0 0 5029 0 0 0 75 38 15 -+sda5 11122462 0 0 5107 0 0 0 74 42 17 -+sda5 11122463 0 0 5028 0 0 1 88 50 17 -+sda5 11122464 0 0 5128 0 0 0 71 40 19 -+sda5 11122465 0 0 5243 0 0 0 83 44 13 -+sda5 11122466 0 0 5176 0 0 0 90 63 20 -+sda5 11122467 0 0 5029 0 0 0 60 34 13 -+sda5 11122468 0 0 5028 0 0 1 79 38 19 -+sda5 11122469 0 0 5030 0 0 0 44 27 12 -+sda5 11122470 0 0 5614 0 0 0 106 62 25 -+ -+Example 3: -+# inspect specified device /dev/sda5, and show jbd2 handle's context when one -+# jbd2 handle runs more than 1ms. -+python alijbd2stat.py -d sda5 -t 1 -+ -+comm: dd pid: 106212 tid: 11151230 pre_start_time: 0 run_time: 585432 sched_delay: 0 -+ jbd2_journal_stop_[k] -+ jbd2_journal_stop_[k] -+ ext4_dirty_inode_[k] -+ __mark_inode_dirty_[k] -+ generic_write_end_[k] -+ ext4_da_write_end_[k] -+ generic_perform_write_[k] -+ __generic_file_write_iter_[k] -+ ext4_file_write_iter_[k] -+ __vfs_write_[k] -+ vfs_write_[k] -+ sys_write_[k] -+ do_syscall_64_[k] -+ entry_SYSCALL_64_after_swapgs_[k] -+comm: kworker/u128:3 pid: 83229 tid: 0 pre_start_time: 0 run_time: 589686 sched_delay: 0 -+ jbd2_journal_stop_[k] -+ jbd2_journal_stop_[k] -+ ext4_convert_unwritten_extents_[k] -+ ext4_end_io_rsv_work_[k] -+ process_one_work_[k] -+ worker_thread_[k] -+ kthread_[k] -+ ret_from_fork_[k] -+sda5 11151229 0 0 5000 0 0 595 401233 255 36 -diff --git a/tools/alimutexsnoop.py b/tools/alimutexsnoop.py -new file mode 100755 -index 0000000..d28ce05 ---- /dev/null -+++ b/tools/alimutexsnoop.py -@@ -0,0 +1,273 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+from __future__ import print_function -+ -+import argparse -+import ctypes as ct -+import os -+import platform -+import re -+import sys -+import time -+ -+from bcc import BPF -+from bcc.utils import printb -+from datetime import datetime -+from time import strftime -+ -+# -+# alimutexsnoop.py Trace all process that holds a mutex longer than a -+# specified time interval (500us, etc). -+# For Linux, uses BCC, eBPF. Embedded C. -+# -+# USAGE: alimutexsnoop [-h] [-t] [-p PID] -+# -+# Author: Alvin Zheng -+_examples = """examples: -+ alimutexsnoop # trace all mutexes that are held for a long time -+ alimutexsnoop -t 50 # set the time threshold (us), mutexes are held longer -+ than this time threshold will be printed -+ alimutexsnoop -p 123 # only trace the specified process -+""" -+""" -+ Copyright (c) 2019 Alvin Zheng, Alibaba, Inc. -+ Licensed under the Apache License, Version 2.0 (the "License") -+""" -+ -+def positive_int(val): -+ try: -+ ival = int(val) -+ except ValueError: -+ raise argparse.ArgumentTypeError("must be an integer") -+ if ival <= 0: -+ raise argparse.ArgumentTypeError("must be positive value") -+ return ival -+ -+def _getParser(): -+ parser = argparse.ArgumentParser( -+ description="Trace all mutexes that have been held for a long time", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=_examples) -+ a = parser.add_argument -+ a("-t", "--threshold", type=positive_int, default=1000, -+ help="The mutexes held longer than this time threshold will be traced(us)") -+ a("-p", "--pid", type=positive_int, -+ help="trace this PID only") -+ return parser.parse_args -+ -+ -+class Global(): -+ parse_args = _getParser() -+ args = None -+ argv = None -+ bpf = None -+ -+class Data(ct.Structure): -+ """Event data matching struct data_t in _embedded_c().""" -+ _TASK_COMM_LEN = 16 # linux/sched.h -+ _pack_ = 1 -+ _fields_ = [ -+ ("start_time", ct.c_ulonglong), # task->start_time -+ ("pid_tgid", ct.c_ulonglong), -+ ("pid", ct.c_uint), # task->tgid, thread group id == sys_getpid() -+ ("mutex_addr", ct.c_ulonglong), -+ ("k_stack_id", ct.c_int), -+ ("u_stack_id", ct.c_int), -+ ("task", ct.c_char * _TASK_COMM_LEN) -+ ] -+ -+def _embedded_c(args): -+ """Generate C programs for mutex_lock and mutex_unlock.""" -+ c = """ -+ EBPF_COMMENT -+ #include -+ BPF_STATIC_ASSERT_DEF -+ BPF_STACK_TRACE(stack_traces,655360); -+ struct key_t{ -+ u64 pid_tgid; -+ u64 mutex_addr; -+ } __attribute__((packed)); -+ BPF_HASH(held_mutexes, struct key_t,u64); -+ struct data_t{ -+ u64 start_time; -+ u64 pid_tgid; -+ u32 pid; -+ u64 mutex_addr; -+ u32 k_stack_id; -+ u32 u_stack_id; -+ char task[TASK_COMM_LEN]; -+ } __attribute__((packed)); -+ BPF_STATIC_ASSERT(sizeof(struct data_t) == CTYPES_SIZEOF_DATA); -+ BPF_PERF_OUTPUT(results); -+ -+ int kprobe_ret_mutex_lock(struct pt_regs * ctx,struct mutex * lock) -+ { -+ struct task_struct *task = (typeof(task))bpf_get_current_task(); -+ FILTER_SELF_LOCK -+ if (FILTER_PID) { return 0; } -+ struct key_t key = { -+ .pid_tgid=bpf_get_current_pid_tgid(), -+ .mutex_addr=(u64)((void*)lock) -+ }; -+ u64 locktime = bpf_ktime_get_ns(); -+ u64 * ptr=held_mutexes.lookup_or_init(&key, &locktime); -+ if(!ptr){ -+ bpf_trace_printk("could not add held_mutexes, thread: %d, mutex: %p \\n" ,task->tgid,lock); -+ return 1; -+ } -+ return 0; -+ } -+ int kprobe_mutex_unlock(struct pt_regs * ctx, struct mutex * lock) -+ { -+ struct task_struct *task = (typeof(task))bpf_get_current_task(); -+ FILTER_SELF_UNLOCK -+ if (FILTER_PID) { return 0; } -+ struct key_t key = { -+ .pid_tgid=bpf_get_current_pid_tgid(), -+ .mutex_addr=(u64)((void*)lock) -+ }; -+ u64 cur = bpf_ktime_get_ns(); -+ u64 * locktime=held_mutexes.lookup(&key); -+ if(!locktime){ -+ bpf_trace_printk("cannot find the lock in thread: %d mutex: %p \\n",task->tgid,lock); -+ return 1; -+ } -+ if(cur - (*locktime) > TIME_THRESHOLD){ -+ struct data_t data = { -+ .start_time=task->start_time, -+ .pid_tgid=bpf_get_current_pid_tgid(), -+ .pid=task->pid, -+ .mutex_addr=(u64)((void*)lock) -+ }; -+ bpf_get_current_comm(&data.task,sizeof(data.task)); -+ data.k_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID); -+ data.u_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID|BPF_F_USER_STACK); -+ results.perf_submit(ctx,&data,sizeof(data)); -+ } -+ held_mutexes.delete(&key); -+ return 0; -+ } -+ """ -+ # TODO: this macro belongs in bcc/src/cc/export/helpers.h -+ bpf_static_assert_def = r""" -+ #ifndef BPF_STATIC_ASSERT -+ #define BPF_STATIC_ASSERT(condition) __attribute__((unused)) \ -+ extern int bpf_static_assert[(condition) ? 1 : -1] -+ #endif -+ """ -+ selfpid = os.getpid() -+ filter_self = r""" -+ if(task->tgid==%d){ -+ return 0; -+ } -+ """ % selfpid -+ code_substitutions = [ -+ ('EBPF_COMMENT', _ebpf_comment()), -+ ("BPF_STATIC_ASSERT_DEF", bpf_static_assert_def), -+ ("CTYPES_SIZEOF_DATA", str(ct.sizeof(Data))), -+ ("TIME_THRESHOLD", str(Global.args.threshold * 1000)), # ns->us -+ ("FILTER_SELF_LOCK", filter_self), -+ ("FILTER_SELF_UNLOCK", filter_self), -+ ('FILTER_PID', '0' if not Global.args.pid else "task->pid != %d" % Global.args.pid) -+ ] -+ for old, new in code_substitutions: -+ c = c.replace(old, new) -+ # print(c) -+ return c -+ -+def _ebpf_comment(): -+ """Return a C-style comment with information about the generated code.""" -+ comment = ('Created by %s at %s:\n\t%s' % -+ (sys.argv[0], strftime("%Y-%m-%d %H:%M:%S %Z"), _embedded_c.__doc__)) -+ args = str(vars(Global.args)).replace('{', '{\n\t').replace(', ', ',\n\t').replace('}', ',\n }\n\n') -+ return ("\n /*" + ("\n %s\n\n ARGV = %s\n\n ARGS = %s/" % -+ (comment, ' '.join(Global.argv), args)) -+ .replace('\n', '\n\t*').replace('\t', ' ')) -+ -+def _print_header(): -+ print("%-16s %-6s %-16s \n" % ("PCOMM", "PID", "MUTEX_ADDR")) -+ -+def _print_event(cpu, data, size): # callback -+ """Print the long-held mutexes.""" -+ stack_traces = Global.bpf['stack_traces'] -+ e = ct.cast(data, ct.POINTER(Data)).contents -+ print("%-16s %-6d %-16x\n" % -+ (e.task.decode(), e.pid, e.mutex_addr), end="") -+ # print the kernel stack -+ if e.k_stack_id >= 0: -+ cur_stack = reversed(list(stack_traces.walk(e.k_stack_id))) -+ for addr in cur_stack: -+ printb(b"Kernel Stack: %-16x %s" % (addr, Global.bpf.ksym(addr))) -+ else: -+ print("No KERNEL STACK FOUND") -+ # print the user stack -+ if e.u_stack_id >= 0: -+ cur_stack = reversed(list(stack_traces.walk(e.u_stack_id))) -+ for addr in cur_stack: -+ printb(b"USER Stack: %-16x %s" % (addr, Global.bpf.sym(addr, e.pid_tgid))) -+ else: -+ print("No USER STACK FOUND") -+ print("=================================") -+ print() -+# ============================= -+# Module: These functions are available for import -+# ============================= -+def initialize(arg_list=sys.argv[1:]): -+ """Trace all long-held mutexes -+ -+ arg_list - list of args, if omitted then uses command line args -+ arg_list is passed to argparse.ArgumentParser.parse_args() -+ -+ Returns a tuple (return_code, result) -+ 0 = Ok, result is the return value from BPF() -+ 1 = args.ebpf is requested, result is the generated C code -+ os.EX_NOPERM: need CAP_SYS_ADMIN, result is error message -+ os.EX_SOFTWARE: internal software error, result is error message -+ """ -+ Global.argv = arg_list -+ Global.args = Global.parse_args(arg_list) -+ if os.geteuid() != 0: -+ return (os.EX_NOPERM, "Need sudo (CAP_SYS_ADMIN) for BPF() system call") -+ c = _embedded_c(Global.args) -+ try: -+ return (os.EX_OK, BPF(text=c)) -+ except Exception as e: -+ error = format(e) -+ return (os.EX_SOFTWARE, "BPF error: " + error) -+ -+def snoop(bpf, event_handler): -+ """Call event_handler when release a lock that is held for a long time -+ bpf - result returned by successful initialize() -+ event_handler - callback function to handle termination event -+ args.pid - Return after event_handler is called, only monitoring this pid -+ """ -+ bpf["results"].open_perf_buffer(event_handler) -+ while True: -+ bpf.perf_buffer_poll() -+ if Global.args.pid: -+ if not os.path.exists('/proc/%d' % Global.args.pid): -+ # the target process has exited -+ break -+ -+# ============================= -+# Script: invoked as a script -+# ============================= -+def main(): -+ try: -+ rc, buffer = initialize() -+ if rc != os.EX_OK: -+ print(buffer) -+ sys.exit(-1) -+ Global.bpf = buffer -+ Global.bpf.attach_kretprobe(event="mutex_lock", fn_name="kprobe_ret_mutex_lock") -+ Global.bpf.attach_kprobe(event="mutex_unlock", fn_name="kprobe_mutex_unlock") -+ _print_header() -+ snoop(buffer, _print_event) -+ except KeyboardInterrupt: -+ print() -+ sys.exit() -+ -+ return 0 -+ -+if __name__ == '__main__': -+ main() -diff --git a/tools/alimutexsnoop_example.txt b/tools/alimutexsnoop_example.txt -new file mode 100644 -index 0000000..67ca03e ---- /dev/null -+++ b/tools/alimutexsnoop_example.txt -@@ -0,0 +1,79 @@ -+Demonstrations of alimutexsnoop. -+ -+This profile tool monitors the mutex that is held by the process longer than -+a specified time threshold, such as 100us. Once alimutexsnoop finds such mutexes, -+it prints the pid of the process that held the mutex and the address of the -+mutex. In addition, alimutexsnoop also prints the user and kernel stack of the -+process. -+ -+ -+Usage: -+alimutexsnoop.py [-h] [-t THRESHOLD] [-p PID] -+ -+Trace all mutexes that have been held for a long time -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -t THRESHOLD, --threshold THRESHOLD -+ The mutexes held longer than this time threshold will -+ be traced(us) -+ -p PID, --pid PID trace this PID only -+ -+examples: -+ alimutexsnoop # trace all mutexes that are held for a long time -+ alimutexsnoop -t 50 # set the time threshold (us), mutexes are held longer -+ than this time threshold will be printed -+ alimutexsnoop -p 123 # only trace the specified process -+ -+ -+Output example: -+ -+python alimutexsnoop.py -t 20 # monitor the mutexes that are held by processes longer than 20 us -+ -+PCOMM PID MUTEX_ADDR -+ -+AliYunDun 4283 ffffa30fd1b4ea48 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316fc ksys_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7f1532c4e6fd read -+================================= -+ -+AliYunDun 4283 ffffa310d7e43c40 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316ba ksys_read -+Kernel Stack: ffffffffa5231149 vfs_read -+Kernel Stack: ffffffffa5230f86 __vfs_read -+Kernel Stack: ffffffffa5254b30 seq_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7f1532c4e6fd read -+================================= -+ -+AliYunDun 4283 ffffa30fd1b4e348 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316fc ksys_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7f1532c4e6fd read -+================================= -+ -+AliYunDunUpdate 4256 ffffa310d1146a40 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316ba ksys_read -+Kernel Stack: ffffffffa5231149 vfs_read -+Kernel Stack: ffffffffa5230f86 __vfs_read -+Kernel Stack: ffffffffa5254b30 seq_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7fbce69966fd read -+================================= -+ -+AliYunDunUpdate 4256 ffffa310d3b48a48 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316fc ksys_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7fbce69966fd read -+================================= -diff --git a/tools/alirunqinfo.py b/tools/alirunqinfo.py -new file mode 100755 -index 0000000..102c763 ---- /dev/null -+++ b/tools/alirunqinfo.py -@@ -0,0 +1,360 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alirunqinfo Summarize task on-CPU time before being descheduled. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alirunqinfo [-h] [-T] [-N] [-D] [-C] [-P] [-c CPU] [-p PID] -+# [interval] [outputs] -+# -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+import ctypes -+import signal -+ -+# arguments -+examples = """examples: -+ ./alirunqinfo # sum runqueue statistics -+ ./alirunqinfo -C # show per-CPU runqueue statistics -+ ./alirunqinfo -P # show per-thread runqueue statistics -+ ./alirunqinfo -D # show runqueue statistics as histograms -+ ./alirunqinfo -c 0 # show runqueue statistics of CPU 0 only -+ ./alirunqinfo -p 25 # show runqueue statistics of PID 25 only -+ ./alirunqinfo 1 10 # print 1 second summaries, 10 times -+ ./alirunqinfo -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize runqueue statistics.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+parser.add_argument("-C", "--cpus", action="store_true", -+ help="show per-CPU runqueue statistics") -+parser.add_argument("-P", "--pids", action="store_true", -+ help="show per-thread runqueue statistics") -+parser.add_argument("-D", "--dist", action="store_true", -+ help="show distributions as histograms") -+parser.add_argument("-c", "--cpu", -+ help="show runqueue statistics on specific CPU only") -+parser.add_argument("-p", "--pid", -+ help="show runqueue statistics of specific PID only") -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+if args.pid: -+ args.pids = True -+if args.cpu: -+ args.cpus = True -+if (not args.cpus) and (not args.pids): -+ args.cpus = True -+ -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct time_stamp { -+ u64 stamp; -+ u32 pid; -+} time_stamp_t; -+ -+BPF_PERCPU_ARRAY(start, time_stamp_t, 1); -+ -+STORAGE -+ -+ -+static inline int sched_switch_exit(u32 cpu, u32 pid, u64 now, time_stamp_t *new) { -+ PID_FILTER -+ -+ u32 idx = 0; -+ // fetch timestamp and calculate delta -+ time_stamp_t *stampp = start.lookup(&idx); -+ if (!stampp || stampp->pid != pid) { -+ return 0; // missed start -+ } -+ u64 delta = now - stampp->stamp; -+ start.update(&idx, new); -+ -+ UPDATE -+ -+ return 0; -+} -+ -+static inline int sched_switch_enter(u32 pid, u64 now, time_stamp_t *new) { -+ PID_FILTER -+ -+ u32 idx = 0; -+ new->stamp = now; -+ new->pid = pid; -+ -+ start.update(&idx, new); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(sched, sched_switch) -+{ -+ u32 cpu = bpf_get_smp_processor_id(); -+ CPU_FILTER -+ -+ u64 now = bpf_ktime_get_ns(); -+ u32 pid, idx = 0; -+ time_stamp_t new = {}; -+ -+ pid = args->prev_pid; -+ if (pid != 0) { -+ sched_switch_exit(cpu, pid, now, &new); -+ } -+ -+ pid = args->next_pid; -+ if (pid != 0) { -+ sched_switch_enter(pid, now, &new); -+ } -+ -+ return 0; -+} -+""" -+ -+ -+# replace STORAGE -+map_key = """ -+typedef struct map_key { -+ u32 cpu; -+ u32 pid; -+} map_key_t; -+""" -+ -+map_val = """ -+typedef struct map_val { -+ u64 time; -+ u64 count; -+} map_val_t; -+""" -+ -+if args.dist: -+ # diagram mode, we need BPF_HISTOGRAM -+ if args.cpus and args.pids: -+ storage = map_key + """ -+typedef struct map_key_slot { -+ map_key_t key; -+ u64 slot; -+} map_key_slot_t; -+""" -+ else: -+ storage = """ -+typedef struct map_key_slot { -+ u64 key; -+ u64 slot; -+} map_key_slot_t; -+""" -+ storage += """BPF_HISTOGRAM(dist, map_key_slot_t);""" -+ -+else: -+ # normal mode, we need BPF_HASH -+ storage = map_val -+ if args.cpus and args.pids: -+ storage += map_key + """BPF_HASH(res, map_key_t, map_val_t);""" -+ else: -+ storage += """BPF_HASH(res, u32, map_val_t);""" -+ -+bpf_text = bpf_text.replace('STORAGE', storage) -+ -+ -+# replace UPDATE -+if args.dist: -+ if args.cpus and args.pids: -+ key_exp = "{.cpu = cpu, .pid = pid,}" -+ elif args.cpus: -+ key_exp = "cpu" -+ elif args.pids: -+ key_exp = "pid" -+ -+ update = """ -+ map_key_slot_t key_slot = { -+ .key = %s, -+ .slot = bpf_log2l(delta / FACTOR), -+ }; -+ dist.increment(key_slot); -+""" % key_exp -+ -+else: -+ if args.cpus and args.pids: -+ update = """ -+ map_key_t key = { -+ .cpu = cpu, -+ .pid = pid, -+ }; -+""" -+ elif args.cpus: -+ update = "u32 key = cpu;" -+ elif args.pids: -+ update = "u32 key = pid;" -+ -+ update += """ -+ map_val_t *valp = res.lookup(&key); -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ } -+ else { -+ map_val_t val = { -+ .count = 1, -+ .time = delta, -+ }; -+ res.update(&key, &val); -+ } -+""" -+bpf_text = bpf_text.replace('UPDATE', update) -+bpf_text = bpf_text.replace('FACTOR', str(factor)) -+ -+ -+# replace FILTER -+cpu_filter = "if (cpu != %s) {return 0;}" % args.cpu if args.cpu else "" -+bpf_text = bpf_text.replace('CPU_FILTER', cpu_filter) -+ -+pid_filter = "if (pid != %s) {return 0;}" % args.pid if args.pid else "" -+bpf_text = bpf_text.replace('PID_FILTER', pid_filter) -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+ -+# print header -+print("Tracing runqueue statistics", end="") -+if args.pid: -+ print(" of PID %s" % args.pid, end="") -+if args.cpu: -+ print(" on CPU %s" % args.cpu, end="") -+if args.dist: -+ print(", in diagram mode", end="") -+print("...Hit Ctrl-C to end.") -+ -+if args.dist: -+ tab = b.get_table("dist") -+else: -+ tab = b.get_table("res") -+ -+ -+def pid_to_comm(pid): -+ try: -+ return open("/proc/%d/comm" % pid, "r").read().rstrip("\n") -+ except IOError: -+ return "" -+ -+ -+def print_per_cpu_thread(tab): -+ cur_cpu = -1 -+ # k is struct map_key; v is struct map_val -+ for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].pid)): -+ if cur_cpu != k.cpu: -+ print("\nCPU%d:" % k.cpu) -+ print("%10s %40s %10s %10s" % ("PID", "Thread", "Count", "TOTAL_" + label)) -+ cur_cpu = k.cpu -+ print("%10d %40s %10d %10d" % (k.pid, pid_to_comm(k.pid), v.count, v.time / factor)) -+ -+ -+def print_per_cpu(tab): -+ print("%10s %10s %10s" % ("CPU", "Count", "TOTAL_" + label)) -+ # k is c_uint (representing CPU), v is struct map_val -+ for k, v in sorted(tab.items(), key=lambda res: res[0].value): -+ print("%10d %10d %10d" % (k.value, v.count, v.time / factor)) -+ -+ -+def print_per_thread(tab): -+ print("%10s %40s %10s %10s" % ("PID", "Thread", "Count", "TOTAL_" + label)) -+ # k is c_uint (representing PID), v is struct map_val -+ for k, v in sorted(tab.items(), key=lambda res: res[0].value): -+ print("%10d %40s %10d %10d" % (k.value, pid_to_comm(k.value), v.count, v.time / factor)) -+ -+ -+def print_dist_per_cpu_thread(tab): -+ tab.print_log2_hist("time_" + label, "section", -+ # k is (cpuN, pidN) formatted by bucket_fn -+ section_print_fn=lambda k: ("CPU %d, PID %d (%s)" % (k[0], k[1], pid_to_comm(k[1]))), -+ bucket_fn=lambda k: (k.cpu, k.pid), # k is struct map_key -+ bucket_sort_fn=lambda k: sorted(k)) # k is struct map_key -+ -+ -+def print_dist_per_cpu(tab): -+ tab.print_log2_hist("time_" + label, "CPU", -+ section_print_fn=lambda k: str(k), # k is u64 representing CPU -+ bucket_sort_fn=lambda k: sorted(k)) # k is u64 representing CPU -+ -+ -+def print_dist_per_thread(tab): -+ tab.print_log2_hist("time_" + label, "Thread", -+ section_print_fn=lambda k: ("%d (%s)" % (k, pid_to_comm(k))), # k is u64 representing PID -+ bucket_sort_fn=lambda k: sorted(k)) # k is u64 representing PID -+ -+ -+# signal handler -+def signal_ignore(signal, frame): -+ print() -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ # as cleanup can take many seconds, trap Ctrl-C: -+ signal.signal(signal.SIGINT, signal_ignore) -+ countdown = 1 -+ -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ if args.dist: -+ if args.cpus and args.pids: -+ print_dist_per_cpu_thread(tab) -+ elif args.cpus: -+ print_dist_per_cpu(tab) -+ elif args.pids: -+ print_dist_per_thread(tab) -+ else: -+ print("Invalid parameter") -+ countdown = 1 -+ else: -+ if args.cpus and args.pids: -+ print_per_cpu_thread(tab) -+ elif args.cpus: -+ print_per_cpu(tab) -+ elif args.pids: -+ print_per_thread(tab) -+ else: -+ print("Invalid parameter") -+ countdown = 1 -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alirunqinfo_example.txt b/tools/alirunqinfo_example.txt -new file mode 100644 -index 0000000..a1c24ca ---- /dev/null -+++ b/tools/alirunqinfo_example.txt -@@ -0,0 +1,203 @@ -+This program summarize runqueue statistics, that is, task on-CPU time -+before being descheduled, in flavor of per-CPU, per-PID, or histogram. -+ -+There are major two categories of parameters: -+1. one category of parameters control the sampling source, such as -+-c[--cpu] and -p[--pid] -+2. another category of parameters control the displaying format, -+such as -C[--cpus], -P[--pids] and -D[--dist] -+ -+These two categories of parameters are Orthogonal. -+ -+ -+Example 1. -+ -+By default, the program will sample all processes on all runqueues -+of all CPUs. Statistics of all processes on the same CPU is -+aggregated and displayed in a per-CPU flavor. -+ -+In this case, "alirunqinfo" is the same as "alirunqinfo -C". -+ -+# alirunqinfo -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ CPU Count TOTAL_usecs -+ 0 77 207 -+ 1 9 27 -+ 2 3 14 -+ 3 2 15 -+ 4 45 122 -+ 6 41 380 -+ 7 6 25 -+ 9 18 61 -+ 10 19 50 -+ 12 2 19 -+ 14 2 189 -+ 16 1 13 -+ 17 18 49 -+ 20 36 73 -+ 21 18 35 -+ 22 22 109 -+ 24 25 92 -+ 28 21 68 -+ 30 4 95 -+ -+ -+Example 2. -+ -+Make the output displayed in a per-process flavor with "-P" parameter. -+In this case, all processes on all CPUs are sampled as usual, while -+all statistics of the same process is aggregated, whichever CPU it -+runs on. -+ -+# alirunqinfo -P -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ PID Thread Count TOTAL_usecs -+ 10 rcu_sched 122 264 -+ 382 kworker/14:1-mm_percpu_wq 1 20 -+ 2926 rngd 1 2 -+ 3327 kworker/16:2-mm_percpu_wq 1 15 -+ 3384 in:imjournal 2 8 -+ 3428 AliYunDunUpdate 20 60 -+ 3429 AliYunDunUpdate 20 40 -+ 3442 aliyun-service 20 98 -+ 3449 AliYunDun 20 137 -+ 3450 AliYunDun 19 58 -+ 3451 AliYunDun 20 56 -+ 3462 AliYunDun 19 46 -+ 3463 AliYunDun 19 100 -+ 3464 AliYunDun 2 220 -+ 3467 AliYunDun 2 6 -+ 3468 AliYunDun 26 61 -+ 3469 AliYunDun 39 78 -+ 3470 AliYunDun 39 125 -+ 3471 AliYunDun 2 10 -+ 3473 AliYunDun 2 4 -+ 3474 AliYunDun 1 4 -+ 3475 AliYunDun 2 11 -+ 3476 AliYunDun 10 30 -+ 3533 sshd 5 193 -+ 4573 kworker/27:2-mm_percpu_wq 2 13 -+ 5016 kworker/24:0-mm_percpu_wq 1 5 -+ 5281 kworker/4:0-events_power_efficient 9 19 -+ 5999 kworker/0:2-events_power_efficient 2 34 -+ 6429 kworker/u64:0-events_unbound 3 14 -+ 6468 kworker/u64:1-events_unbound 2 37 -+ -+ -+Example 3. -+ -+You can set "-P" and "-C" parameter at the same time, in which case, the -+output is displayed in a per-CPU and per-process flavor. -+ -+# alirunqinfo -PC -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ -+CPU0: -+ PID Thread Count TOTAL_usecs -+ 10 rcu_sched 72 147 -+ 3467 AliYunDun 1 6 -+ -+CPU1: -+ PID Thread Count TOTAL_usecs -+ 3476 AliYunDun 5 26 -+ -+CPU2: -+ PID Thread Count TOTAL_usecs -+ 3384 in:imjournal 2 12 -+ -+CPU3: -+ PID Thread Count TOTAL_usecs -+ 2926 rngd 2 7 -+ -+ -+Example 4. -+ -+Sample specified process only with "-p", and "-P" is added implicitly -+when "-p" is specified. -+ -+# alirunqinfo -p 3429 -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ PID Thread Count TOTAL_usecs -+ 3429 AliYunDunUpdate 14 40 -+ -+ -+Example 5. -+ -+When both "-p" and "-C" parameters are specified, the output is displayed -+in per-CPU and per-process flavor. -+ -+# alirunqinfo -p 3429 -C -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ -+CPU22: -+ PID Thread Count TOTAL_usecs -+ 3429 AliYunDunUpdate 29 67 -+ -+ -+Example 6. -+ -+Sample processes on specified CPU only with "-c", and "-C" is added -+implicitly when "-c" is specified. -+ -+# alirunqinfo -c 22 -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ CPU Count TOTAL_usecs -+ 22 24 90 -+ -+ -+Example 7. -+ -+Make the output displayed in a diagram when "-D" is specified. -+ -+# alirunqinfo -p 3429 -D -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ -+Thread = 3429 (AliYunDunUpdate) -+ time_usecs : count distribution -+ 0 -> 1 : 22 |****************************************| -+ 2 -> 3 : 12 |********************* | -+ 4 -> 7 : 0 | | -+ 8 -> 15 : 0 | | -+ 16 -> 31 : 0 | | -+ 32 -> 63 : 1 |* | -+ -+ -+ -+USAGE message: -+ -+# alirunqinfo -h -+usage: alirunqinfo.py [-h] [-T] [-N] [-C] [-P] [-D] [-c CPU] [-p PID] -+ [interval] [outputs] -+ -+Summarize runqueue statistics, including total count, time, etc. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -C, --cpus show per-CPU runqueue statistics -+ -P, --pids show per-thread runqueue statistics -+ -D, --dist show distributions as histograms -+ -c CPU, --cpu CPU show runqueue statistics on specific CPU only -+ -p PID, --pid PID show runqueue statistics of specific PID only -+ -+examples: -+ ./alirunqinfo # sum runqueue statistics -+ ./alirunqinfo -C # show per-CPU runqueue statistics -+ ./alirunqinfo -P # show per-thread runqueue statistics -+ ./alirunqinfo -D # show runqueue statistics as histograms -+ ./alirunqinfo -c 0 # show runqueue statistics of CPU 0 only -+ ./alirunqinfo -p 25 # show runqueue statistics of PID 25 only -+ ./alirunqinfo 1 10 # print 1 second summaries, 10 times -+ ./alirunqinfo -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/alisoftirqs.py b/tools/alisoftirqs.py -new file mode 100755 -index 0000000..202114f ---- /dev/null -+++ b/tools/alisoftirqs.py -@@ -0,0 +1,215 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alisoftirqs Summarize soft IRQ (interrupt) event time. -+# Enhanced version of BCC softirqs. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alisoftirqs [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] [interval] [outputs] -+# -+# Copyright (c) 2015 Brendan Gregg. -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+ -+# arguments -+examples = """examples: -+ ./alisoftirqs # sum soft irq event time -+ ./alisoftirqs -D # show soft irq event time as histograms -+ ./alisoftirqs -c 0 # show soft irq event time of CPU 0 only -+ ./alisoftirqs -i 25 # show event time of IRQ 25 only -+ ./alisoftirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only -+ ./alisoftirqs 1 10 # print 1 second summaries, 10 times -+ ./alisoftirqs -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize soft irq statistics, including total count, time, etc.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+parser.add_argument("-D", "--dist", action="store_true", -+ help="show distributions as histograms") -+parser.add_argument("-c", "--cpu", -+ help="output softIRQ statistics on specific CPU only") -+parser.add_argument("-i", "--irq", -+ help="output statistics of specific softIRQ") -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+distmode = "1" if args.dist else "0" -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct irq_key { -+ u32 cpu; -+ u32 irq; -+} irq_key_t; -+ -+typedef struct irq_key_slot { -+ irq_key_t key; -+ u64 slot; -+} irq_key_slot_t; -+ -+typedef struct irq_val { -+ u64 count; -+ u64 time; -+ u64 time_max; -+} irq_val_t; -+ -+BPF_PERCPU_ARRAY(start, u64, 1); -+BPF_HASH(res, irq_key_t, irq_val_t); -+BPF_HISTOGRAM(dist, irq_key_slot_t); -+ -+ -+TRACEPOINT_PROBE(irq, softirq_entry) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ u64 ts = bpf_ktime_get_ns(); -+ start.update(&idx, &ts); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(irq, softirq_exit) -+{ -+ u64 *tsp, delta; -+ u32 cpu = bpf_get_smp_processor_id(); -+ u32 irq = args->vec; -+ -+ u32 idx = 0; -+ // fetch timestamp and calculate delta -+ tsp = start.lookup(&idx); -+ if (!tsp || *tsp == 0) { -+ return 0; // missed start -+ } -+ -+ delta = bpf_ktime_get_ns() - *tsp; -+ irq_key_t key = {.cpu = cpu, .irq = irq}; -+ -+#if DIST_MODE -+ irq_key_slot_t key_slot = { -+ .key = key, -+ .slot = bpf_log2l(delta / FACTOR), -+ }; -+ dist.increment(key_slot); -+#else -+ struct irq_val *valp, val; -+ valp = res.lookup(&key); -+ -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ if (valp->time_max < delta) {valp->time_max = delta;} -+ } -+ else { -+ val.count = 1; -+ val.time = val.time_max = delta; -+ res.update(&key, &val); -+ } -+#endif -+ u64 zero = 0; -+ start.update(&idx, &zero); -+ -+ return 0; -+} -+""" -+ -+cond_filter = '' -+cpu_filter = "1" -+if args.cpu: -+ cpu_filter = "bpf_get_smp_processor_id() == %s" % args.cpu -+ -+irq_filter = "1" -+if args.irq: -+ irq_filter = "args->vec == %s" % args.irq -+ -+cond_filter = "if (!(%s && %s)) {return 0;}" % (cpu_filter, irq_filter) -+bpf_text = bpf_text.replace('COND_FILTER', cond_filter) -+ -+bpf_text = bpf_text.replace('FACTOR', "%d" % factor) -+bpf_text = bpf_text.replace('DIST_MODE', distmode) -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+def vec_to_name(vec): -+ # copied from softirq_to_name() in kernel/softirq.c -+ # may need updates if new softirq handlers are added -+ return ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll", -+ "tasklet", "sched", "hrtimer", "rcu"][vec] -+ -+def print_section(key): -+ return "CPU %d, softIRQ %d (%s)" % (key[0], key[1], vec_to_name(key[1])) -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing soft irq event time... Hit Ctrl-C to end.") -+ -+if args.dist: -+ tab = b.get_table("dist") -+else: -+ tab = b.get_table("res") -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ countdown = 1 -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ if args.dist: -+ tab.print_log2_hist("time_" + label, "section", -+ section_print_fn=print_section, -+ bucket_fn=lambda k: (k.cpu, k.irq), -+ bucket_sort_fn=lambda keys: sorted(keys)) -+ else: -+ cur_cpu = -1 -+ for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].irq)): -+ if cur_cpu != k.cpu: -+ print("\nCPU%d:" % k.cpu) -+ print("%10s %20s %10s %10s %10s" % -+ ("SOFTIRQ", "IRQHandler", "Count", "TOTAL_" + label, "MAX_" + label)) -+ cur_cpu = k.cpu -+ -+ print("%10d %20s %10d %10d %10d" % -+ (k.irq, vec_to_name(k.irq), v.count, v.time / factor, v.time_max / factor)) -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alisoftirqs_example.txt b/tools/alisoftirqs_example.txt -new file mode 100644 -index 0000000..48e0023 ---- /dev/null -+++ b/tools/alisoftirqs_example.txt -@@ -0,0 +1,128 @@ -+Demonstrations of alisoftirqs, the enhanced version of BCC softirqs. -+ -+This program traces soft interrupts (irqs), and stores timing statistics -+in-kernel for efficiency. -+ -+ -+Example 1 -+ -+By default, all statistics of all softirq on all CPUs are sampled, -+and the output is displayed in a per-softirq and per-CPU flavor. -+ -+# alisoftirqs -+Tracing soft irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU0: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 7 sched 1 59 59 -+ -+CPU3: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 1 0 0 -+ 3 net_rx 6 29 9 -+ 7 sched 1 1 1 -+ 9 rcu 1 0 0 -+ -+CPU4: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 29 11 1 -+ 7 sched 11 26 4 -+ 9 rcu 19 22 5 -+ -+ -+Example 2 -+ -+You can sample statistics on specified CPU only with "-c" option, -+and the output will be displayed in a per-softirq flavor. -+ -+# alisoftirqs -c 6 -+Tracing soft irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU6: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 7 2 0 -+ 7 sched 2 9 8 -+ 9 rcu 5 10 5 -+ -+ -+Exampel 3 -+ -+You can sample statistics of specified softirq only with "-i" option, -+and the output will be displayed in a per-CPU flavor. -+ -+# alisoftirqs -i 1 -+Tracing soft irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU0: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 1 2 2 -+ -+CPU4: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 26 11 2 -+ -+CPU7: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 2 5 4 -+ -+ -+ -+Example 4 -+ -+The distribution of softirq run time can be printed as a histogram with "-D" -+option. The output will be displayed in a per-softirq and per-CPU flavor. -+ -+# alisoftirqs -D -+Tracing soft irq event time... Hit Ctrl-C to end. -+^C -+ -+section = CPU 3, softIRQ 3 (net_rx) -+ time_usecs : count distribution -+ 0 -> 1 : 0 | | -+ 2 -> 3 : 2 |****************************************| -+ 4 -> 7 : 0 | | -+ 8 -> 15 : 1 |******************** | -+ -+section = CPU 4, softIRQ 1 (timer) -+ time_usecs : count distribution -+ 0 -> 1 : 13 |****************************************| -+ -+section = CPU 4, softIRQ 7 (sched) -+ time_usecs : count distribution -+ 0 -> 1 : 2 |****************************************| -+ 2 -> 3 : 1 |******************** | -+ 4 -> 7 : 2 |****************************************| -+ -+ -+ -+USAGE message: -+ -+# alisoftirqs -h -+usage: alisoftirqs.py [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] -+ [interval] [outputs] -+ -+Summarize soft irq statistics, including total count, time, etc. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -D, --dist show distributions as histograms -+ -c CPU, --cpu CPU output softIRQ statistics on specific CPU only -+ -i IRQ, --irq IRQ output statistics of specific softIRQ -+ -+examples: -+ ./alisoftirqs # sum soft irq event time -+ ./alisoftirqs -D # show soft irq event time as histograms -+ ./alisoftirqs -c 0 # show soft irq event time of CPU 0 only -+ ./alisoftirqs -i 25 # show event time of IRQ 25 only -+ ./alisoftirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only -+ ./alisoftirqs 1 10 # print 1 second summaries, 10 times -+ ./alisoftirqs -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/alisyscount.py b/tools/alisyscount.py -new file mode 100755 -index 0000000..921fac7 ---- /dev/null -+++ b/tools/alisyscount.py -@@ -0,0 +1,329 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alisyscount Summarize syscall counts and latencies. -+# Enhanced version of BCC syscount. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alisyscount [-h] [-T] [-N] [-D] [-S] [-P] [-s syscall] [-p pid] [interval] [outputs] -+# -+# Copyright (c) 2015 Brendan Gregg. -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+import ctypes -+from bcc.syscall import syscall_name, syscalls -+ -+# arguments -+examples = """examples: -+ ./alisyscount # sum syscall counts and latencies. -+ ./alisyscount -S # show per-syscall statistics -+ ./alisyscount -P # show per-thread syscall statistics -+ ./alisyscount -D # show syscall statistics as histograms -+ ./alisyscount -s 0 # show syscall statistics of syscall 0 only -+ ./alisyscount -p 25 # show syscall statistics of PID 25 only -+ ./alisyscount 1 10 # print 1 second summaries, 10 times -+ ./alisyscount -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize syscall counts and latencies.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+parser.add_argument("-S", "--syscalls", action="store_true", -+ help="show per-syscall statistics") -+parser.add_argument("-P", "--pids", action="store_true", -+ help="show per-thread syscall statistics") -+parser.add_argument("-D", "--dist", action="store_true", -+ help="show distributions as histograms") -+parser.add_argument("-s", "--syscall", -+ help="show statistics of specific syscall only") -+parser.add_argument("-p", "--pid", -+ help="show syscall statistics of specific PID only") -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+if args.pid: -+ args.pids = True -+if args.syscall: -+ args.syscalls = True -+if (not args.syscalls) and (not args.pids): -+ args.syscalls = True -+ -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+BPF_PERCPU_ARRAY(start, u64, 1); -+ -+STORAGE -+ -+TRACEPOINT_PROBE(raw_syscalls, sys_enter) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ u64 ts = bpf_ktime_get_ns(); -+ start.update(&idx, &ts); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(raw_syscalls, sys_exit) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ u64 *tsp, delta; -+ tsp = start.lookup(&idx); -+ if (!tsp || *tsp == 0) { -+ return 0; // missed start -+ } -+ -+ delta = bpf_ktime_get_ns() - *tsp; -+ u64 zero = 0; -+ start.update(&idx, &zero); -+ -+ u32 sysnr = args->id; -+ u32 pid = bpf_get_current_pid_tgid(); -+ -+ UPDATE -+ -+ return 0; -+} -+""" -+ -+ -+# replace STORAGE -+map_key = """ -+typedef struct map_key { -+ u32 sysnr; -+ u32 pid; -+} map_key_t; -+""" -+ -+map_val = """ -+typedef struct map_val { -+ u64 time; -+ u64 count; -+} map_val_t; -+""" -+ -+if args.dist: -+ if args.syscalls and args.pids: -+ storage = map_key + """ -+typedef struct map_key_slot { -+ map_key_t key; -+ u64 slot; -+} map_key_slot_t; -+""" -+ else: -+ storage = """ -+typedef struct map_key_slot { -+ u64 key; -+ u64 slot; -+} map_key_slot_t; -+""" -+ storage += "BPF_HISTOGRAM(dist, map_key_slot_t);" -+ -+else: -+ storage = map_val -+ if args.syscalls and args.pids: -+ storage += map_key + "BPF_HASH(res, map_key_t, map_val_t);" -+ else: -+ storage += "BPF_HASH(res, u32, map_val_t);" -+bpf_text = bpf_text.replace('STORAGE', storage) -+ -+ -+# replace UPDATE -+if args.dist: -+ if args.syscalls and args.pids: -+ key_exp = "{.sysnr = sysnr, .pid = pid,}" -+ elif args.syscalls: -+ key_exp = "sysnr" -+ elif args.pids: -+ key_exp = "pid" -+ -+ update = """ -+ map_key_slot_t key_slot = { -+ .key = %s, -+ .slot = bpf_log2l(delta / FACTOR), -+ }; -+ dist.increment(key_slot); -+""" % key_exp -+ -+else: -+ if args.syscalls and args.pids: -+ update = """ -+ map_key_t key = { -+ .sysnr = sysnr, -+ .pid = pid, -+ }; -+""" -+ elif args.syscalls: -+ update = "u32 key = sysnr;" -+ elif args.pids: -+ update = "u32 key = pid;" -+ -+ update += """ -+ map_val_t val = {}; -+ map_val_t *valp = res.lookup_or_init(&key, &val); -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ } -+""" -+bpf_text = bpf_text.replace('UPDATE', update) -+bpf_text = bpf_text.replace('FACTOR', str(factor)) -+ -+ -+# replace FILTER -+sysnr_match = "args->id == %s" % args.syscall -+pid_match = "((u32)bpf_get_current_pid_tgid()) == %s" % args.pid -+ -+if args.syscall and args.pid: -+ cond_match = "(%s) && (%s)" % (sysnr_match, pid_match) -+elif args.syscall: -+ cond_match = sysnr_match -+elif args.pid: -+ cond_match = pid_match -+else: -+ cond_match = "" -+ -+cond_filter = "if (!(%s)) {return 0;}" % cond_match if cond_match else "" -+bpf_text = bpf_text.replace('COND_FILTER', cond_filter) -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing syscall statistics... Hit Ctrl-C to end.") -+ -+if args.dist: -+ tab = b.get_table("dist") -+else: -+ tab = b.get_table("res") -+ -+ -+def pid_to_comm(pid): -+ try: -+ return open("/proc/%d/comm" % pid, "r").read().rstrip("\n") -+ except IOError: -+ return "" -+ -+ -+def get_pid_info(pid): -+ return "%10d %30s" % (pid, pid_to_comm(pid)) -+ -+def get_syscall_info(sysnr): -+ return "%10d %30s" % (sysnr, syscall_name(sysnr)) -+ -+def skip_errata(k): -+ if args.syscalls and args.pids: -+ sysnr = k.sysnr -+ else: # args.syscalls or args.pids -+ sysnr = k.value -+ -+ # happens occasionally, we don't need it -+ if sysnr == 0xFFFFFFFF: -+ return True -+ else: -+ return False -+ -+ -+def print_stat(tab): -+ if args.syscalls and args.pids: -+ section_hdr = "%10s %30s %10s %30s" % ("SysNR", "Syscall", "PID", "Thread") -+ elif args.pids: -+ section_hdr = "%10s %30s" % ("PID", "Thread") -+ else: # args.syscalls -+ section_hdr = "%10s %30s" % ("SysNR", "Syscall") -+ -+ section_hdr += " %10s %10s" % ("Count", "TOTAL_" + label) -+ print(section_hdr) -+ -+ # k is struct map_key; v is struct map_val -+ for k, v in sorted(tab.items(), key=lambda res: (-res[1].count, -res[1].time)): -+ if skip_errata(k): -+ continue -+ -+ if args.syscalls and args.pids: -+ record = "%40s %40s" % (get_syscall_info(k.sysnr), get_pid_info(k.pid)) -+ elif args.pids: -+ record = "%40s" % (get_pid_info(k.value)) -+ else: # args.syscalls -+ record = "%40s" % (get_syscall_info(k.value)) -+ -+ record += "%10d %10d" % (v.count, v.time / factor) -+ print(record) -+ -+ -+def get_dist_pid_info(pid): -+ return "PID %d (%s)" % (pid, pid_to_comm(pid)) -+ -+def get_dist_syscall_info(sysnr): -+ return "Syscall %d (%s)" % (sysnr, syscall_name(sysnr)) -+ -+def print_dist_section(k): -+ if args.syscalls and args.pids: -+ return "%s %s" % (get_dist_syscall_info(k[0]), get_dist_pid_info(k[1])) -+ elif args.pids: -+ return "%40s" % (get_dist_pid_info(k)) -+ else: -+ return "%40s" % (get_dist_syscall_info(k)) -+ -+def print_dist_stat(tab): -+ section = "section" if args.syscalls and args.pids else "Thread" if args.pids else "Syscall" -+ tab.print_log2_hist("time_" + label, section, -+ section_print_fn=print_dist_section, -+ bucket_fn=(lambda k: (k.sysnr, k.pid)) if args.syscalls and args.pids else None, -+ bucket_sort_fn=lambda k: sorted(k)) -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ countdown = 1 -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ if args.dist: -+ print_dist_stat(tab) -+ else: -+ print_stat(tab) -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alisyscount_example.txt b/tools/alisyscount_example.txt -new file mode 100644 -index 0000000..935f2ff ---- /dev/null -+++ b/tools/alisyscount_example.txt -@@ -0,0 +1,199 @@ -+This program summarize syscall counts and latencies, in flavor of -+per-CPU, per-PID, or histogram. -+ -+There are major two categories of parameters: -+1. one category of parameters control the sampling source, such as -+-s[--syscall] and -p[--pid] -+2. another category of parameters control the displaying format, -+such as -S[--syscalls], -P[--pids] and -D[--dist] -+ -+These two categories of parameters are Orthogonal. -+ -+ -+Example 1. -+ -+By default, the program will sample all syscalls of all processes. -+Statistics of all processes on the same syscall is aggregated and -+displayed in a per-syscall flavor. -+ -+In this case, "alisyscount" is the same as "alisyscount -S". -+ -+# alisyscount -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ SysNR Syscall Count TOTAL_usecs -+ 35 nanosleep 246 23815620 -+ 4 stat 207 306 -+ 0 read 61 65 -+ 321 bpf 43 112 -+ 3 close 28 23 -+ 14 rt_sigprocmask 20 5 -+ 232 epoll_wait 20 5 -+ 162 sync 19 1901216 -+ 228 clock_gettime 18 5 -+ 23 select 9 4016487 -+ 6 lstat 9 16 -+ 1 write 8 136 -+ 2 open 8 19 -+ 72 fcntl 6 1 -+ 5 fstat 4 2 -+ 16 ioctl 3 3 -+ 271 ppoll 1 900954 -+ 78 getdents 1 355 -+ 89 readlink 1 4 -+ 257 openat 1 2 -+ 9 mmap 1 2 -+ 100 times 1 2 -+ 13 rt_sigaction 1 0 -+ -+ -+Example 2. -+ -+Make the output displayed in a per-process flavor with "-P" parameter. -+In this case, all syscalls of all processe are sampled as usual, while -+all statistics of the same process is aggregated, whatever syscall it -+runs. -+ -+# alisyscount -P -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ PID Thread Count TOTAL_usecs -+ 3550 AliYunDun 125 3103841 -+ 3529 AliYunDun 71 3101981 -+ 3549 AliYunDun 62 3103861 -+ 3543 AliYunDun 61 3001874 -+ 3493 aliyun-service 61 3001874 -+ 3564 sshd 48 3149751 -+ 3824 alisyscount.py 46 3149398 -+ 3548 AliYunDun 40 3002530 -+ 3544 AliYunDun 32 2000233 -+ 3530 AliYunDun 31 3101971 -+ 3556 AliYunDun 31 3001000 -+ 3542 AliYunDun 30 3001918 -+ 3531 AliYunDun 30 3001871 -+ 3423 AliYunDunUpdate 30 3001867 -+ 3424 AliYunDunUpdate 30 3001853 -+ 3475 in:imjournal 5 1801873 -+ 3551 AliYunDun 3 1000111 -+ 3422 AliYunDunUpdate 3 1 -+ 3553 AliYunDun 2 2000172 -+ 3547 AliYunDun 2 2000172 -+ 3555 AliYunDun 2 2000162 -+ -+ -+Example 3. -+ -+You can set "-P" and "-S" parameter at the same time, in which case, the -+output is displayed in a per-syscall and per-process flavor. -+ -+# alisyscount -P -S -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ SysNR Syscall PID Thread Count TOTAL_usecs -+ 4 stat 3550 AliYunDun 586 855 -+ 35 nanosleep 3549 AliYunDun 40 2002500 -+ 0 read 3550 AliYunDun 36 90 -+ 35 nanosleep 3548 AliYunDun 26 1951607 -+ 321 bpf 3832 alisyscount.py 23 36 -+ 35 nanosleep 3550 AliYunDun 22 1901390 -+ 35 nanosleep 3530 -+ -+ -+Example 4. -+ -+Sample specified process only with "-p", and "-P" is added implicitly -+when "-p" is specified. -+ -+# alisyscount -p 3550 -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ PID Thread Count TOTAL_usecs -+ 3550 AliYunDun 121 3003773 -+ -+ -+Example 5. -+ -+When both "-p" and "-S" parameters are specified, the output is displayed -+in per-syscall and per-process flavor. -+ -+# alisyscount -p 3550 -S -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ SysNR Syscall PID Thread Count TOTAL_usecs -+ 0 read 3550 AliYunDun 42 16 -+ 35 nanosleep 3550 AliYunDun 41 2052562 -+ -+ -+Example 6. -+ -+Sample specified syscall only with "-s", and "-S" is added -+implicitly when "-s" is specified. -+ -+# alisyscount -s 3 -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ SysNR Syscall Count TOTAL_usecs -+ 3 close 36 15 -+ -+ -+Example 7. -+ -+Make the output displayed in a diagram when "-D" is specified. -+ -+# alisyscount -p 3550 -D -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ -+Thread = PID 3550 (AliYunDun) -+ time_usecs : count distribution -+ 0 -> 1 : 33 |****************************************| -+ 2 -> 3 : 0 | | -+ 4 -> 7 : 0 | | -+ 8 -> 15 : 0 | | -+ 16 -> 31 : 0 | | -+ 32 -> 63 : 0 | | -+ 64 -> 127 : 0 | | -+ 128 -> 255 : 0 | | -+ 256 -> 511 : 0 | | -+ 512 -> 1023 : 0 | | -+ 1024 -> 2047 : 0 | | -+ 2048 -> 4095 : 0 | | -+ 4096 -> 8191 : 0 | | -+ 8192 -> 16383 : 0 | | -+ 16384 -> 32767 : 0 | | -+ 32768 -> 65535 : 32 |************************************** | -+ -+ -+ -+USAGE message: -+ -+# alisyscount -h -+usage: alisyscount.py [-h] [-T] [-N] [-S] [-P] [-D] [-s SYSCALL] [-p PID] -+ [interval] [outputs] -+ -+Summarize syscall counts and latencies. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -S, --syscalls show per-syscall statistics -+ -P, --pids show per-thread syscall statistics -+ -D, --dist show distributions as histograms -+ -s SYSCALL, --syscall SYSCALL -+ show statistics of specific syscall only -+ -p PID, --pid PID show syscall statistics of specific PID only -+ -+examples: -+ ./alisyscount # sum syscall counts and latencies. -+ ./alisyscount -S # show per-syscall statistics -+ ./alisyscount -P # show per-thread syscall statistics -+ ./alisyscount -D # show syscall statistics as histograms -+ ./alisyscount -s 0 # show syscall statistics of syscall 0 only -+ ./alisyscount -p 25 # show syscall statistics of PID 25 only -+ ./alisyscount 1 10 # print 1 second summaries, 10 times -+ ./alisyscount -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/alisysdelay.py b/tools/alisysdelay.py -new file mode 100755 -index 0000000..e4c7952 ---- /dev/null -+++ b/tools/alisysdelay.py -@@ -0,0 +1,297 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alisysdelay Catch tasks hung in kernel mode and get its stack -+# For Linux, uses BCC, BPF, perf_events. Embedded C. -+# -+# In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes -+# may be unacceptable, because some tasks may run too long in kernel mode -+# without calling cond_sched(). These tasks may be kernel thread, or user -+# processes that enter kernel mode by syscall. -+# -+# alisysdelay can catch tasks that have been running continuously for a long time -+# whithout scheduling, and kernel/user stackes of these tasks will be printed. -+# The threshold can be user defined. -+# -+# By default CPU idle stacks are excluded by simply excluding PID 0. -+# -+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). -+# -+# USAGE: alisysdelay [-h] [-d] [-a] [-f frequency] [-t threshold] [--stack-storage-size size] -+# -+# Copyright (c) 2016 Brendan Gregg, Netflix, Inc. -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF, PerfType, PerfSWConfig -+import argparse -+import errno -+import ctypes as ct -+ -+ -+# arg validation -+def positive_int(val): -+ try: -+ ival = int(val) -+ except ValueError: -+ raise argparse.ArgumentTypeError("must be an integer") -+ -+ if ival < 0: -+ raise argparse.ArgumentTypeError("must be positive") -+ return ival -+ -+def positive_nonzero_int(val): -+ ival = positive_int(val) -+ if ival == 0: -+ raise argparse.ArgumentTypeError("must be nonzero") -+ return ival -+ -+def stack_id_err(stack_id): -+ # -EFAULT in get_stackid normally means the stack-trace is not availible, -+ # Such as getting kernel stack trace in userspace code -+ return (stack_id < 0) and (stack_id != -errno.EFAULT) -+ -+# arguments -+examples = """examples: -+ ./alisysdelay # catch tasks hung in kernel mode -+ ./alisysdelay -f 99 # catch tasks hung in kernel mode at 99 Hertz -+ ./alisysdelay -t 100 # catch tasks that hung in kernel mode for more than 100us -+""" -+parser = argparse.ArgumentParser( -+ description="Catch tasks hung in kernel mode and get its stack", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-f", "--frequency", type=positive_nonzero_int, default=100, -+ help="sample frequency in Hertz " -+ "(default %(default)s)") -+parser.add_argument("-d", "--delimited", action="store_true", -+ help="insert delimiter between kernel/user stacks") -+parser.add_argument("-a", "--annotations", action="store_true", -+ help="add _[k] annotations to kernel frames") -+parser.add_argument("-t", "--threshold", type=positive_nonzero_int, default=50, -+ help="set the threshold of delay in kernel mode, in us " -+ "(default %(default)s)") -+parser.add_argument("--stack-storage-size", type=positive_nonzero_int, default=16384, -+ help="the number of unique stack traces that can be stored and displayed " -+ "(default %(default)s)") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+# option logic -+args = parser.parse_args() -+debug = 0 -+ -+ -+# define BPF program -+bpf_text = """ -+#include -+#include -+#include -+#include -+#include -+ -+typedef struct event { -+ u32 pid; -+ u64 kernel_ip; -+ int user_stack_id; -+ int kernel_stack_id; -+ char comm[TASK_COMM_LEN]; -+} event_t; -+ -+BPF_PERCPU_ARRAY(start, u64, 1); -+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE); -+BPF_PERF_OUTPUT(events); -+ -+static inline void update_start_time() -+{ -+ u32 idx = 0; -+ u64 ts = bpf_ktime_get_ns(); -+ start.update(&idx, &ts); -+} -+ -+TRACEPOINT_PROBE(raw_syscalls, sys_enter) -+{ -+ update_start_time(); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(sched, sched_switch) -+{ -+ update_start_time(); -+ return 0; -+} -+ -+int kprobe___cond_resched(struct pt_regs *ctx) -+{ -+ update_start_time(); -+ return 0; -+} -+ -+// This is copy from arch/x86/asm/ptrace.h -+static inline int user_mode_dump(struct pt_regs *regs) -+{ -+#ifdef CONFIG_X86_32 -+ u64 cs, flags; -+ bpf_probe_read(&cs, sizeof(u64), ®s->cs); -+ bpf_probe_read(&flags, sizeof(u64), ®s->flags); -+ return ((cs & SEGMENT_RPL_MASK) | (flags & X86_VM_MASK)) >= USER_RPL; -+#else -+ u64 cs; -+ bpf_probe_read(&cs, sizeof(u64), ®s->cs); -+ return !!( cs & 3); -+#endif -+} -+ -+static bool need_check_delay(struct pt_regs *regs) -+{ -+ // ignore process 0 -+ if (bpf_get_current_pid_tgid() == 0) { -+ return 0; -+ } -+ -+ // skip when currently cpu is in user mode -+ if (user_mode_dump(regs)) { -+ return 0; -+ } -+ -+ return 1; -+} -+ -+static inline void get_stack_id(struct bpf_perf_event_data *ctx) -+{ -+ event_t key = {.pid = bpf_get_current_pid_tgid()}; -+ bpf_get_current_comm(&key.comm, sizeof(key.comm)); -+ -+ // get stacks -+ key.user_stack_id = stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK); -+ key.kernel_stack_id = stack_traces.get_stackid(&ctx->regs, 0); -+ -+ if (key.kernel_stack_id >= 0) { -+ // populate extras to fix the kernel stack -+ u64 ip = PT_REGS_IP(&ctx->regs); -+ u64 page_offset; -+ -+ // if ip isn't sane, leave key ips as zero for later checking -+#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE) -+ // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it -+ page_offset = __PAGE_OFFSET_BASE; -+#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4) -+ // x64, 4.17, and later -+#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL) -+ page_offset = __PAGE_OFFSET_BASE_L5; -+#else -+ page_offset = __PAGE_OFFSET_BASE_L4; -+#endif -+#else -+ // earlier x86_64 kernels, e.g., 4.6, comes here -+ // arm64, s390, powerpc, x86_32 -+ page_offset = PAGE_OFFSET; -+#endif -+ -+ if (ip > page_offset) { -+ key.kernel_ip = ip; -+ } -+ } -+ -+ events.perf_submit(ctx, &key, sizeof(event_t)); -+} -+ -+int sys_timer(struct bpf_perf_event_data *ctx) { -+ -+ if (need_check_delay(&ctx->regs)) { -+ u32 idx = 0; -+ u64 *tsp = start.lookup(&idx); -+ if (!tsp || *tsp == 0) { -+ return 0; // missed start -+ } -+ -+ if ((bpf_ktime_get_ns() - *tsp) >= (DELAY_THRESHOLD)) { -+ get_stack_id(ctx); -+ } -+ } -+ else { -+ update_start_time(); -+ } -+ -+ return 0; -+} -+""" -+ -+# set stack storage size -+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) -+bpf_text = bpf_text.replace('DELAY_THRESHOLD', str(args.threshold * 1000)) -+ -+ -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+# initialize BPF & perf_events -+b = BPF(text=bpf_text) -+b.attach_perf_event(ev_type=PerfType.SOFTWARE, -+ ev_config=PerfSWConfig.CPU_CLOCK, fn_name="sys_timer", -+ sample_freq=args.frequency, pid=-1, cpu=-1, group_fd=-1) -+ -+ -+def aksym(addr): -+ if args.annotations: -+ return b.ksym(addr) + "_[k]".encode() -+ else: -+ return b.ksym(addr) -+ -+TASK_COMM_LEN = 16 -+class Record(ct.Structure): -+ _fields_ = [("pid", ct.c_ulong), -+ ("kernel_ip", ct.c_ulonglong), -+ ("user_stack_id", ct.c_int), -+ ("kernel_stack_id", ct.c_int), -+ ("comm", ct.c_char * TASK_COMM_LEN)] -+ -+ -+def print_stack(cpu, data, size): -+ k = ct.cast(data, ct.POINTER(Record)).contents -+ -+ user_stack = [] if k.user_stack_id < 0 else stack_traces.walk(k.user_stack_id) -+ kernel_tmp = [] if k.kernel_stack_id < 0 else stack_traces.walk(k.kernel_stack_id) -+ -+ # fix kernel stack -+ kernel_stack = [] -+ if k.kernel_stack_id >= 0: -+ for addr in kernel_tmp: -+ kernel_stack.append(addr) -+ # the later IP checking -+ if k.kernel_ip: -+ kernel_stack.insert(0, k.kernel_ip) -+ -+ # print default multi-line stack output -+ if stack_id_err(k.kernel_stack_id): -+ print(" [Missed Kernel Stack]") -+ else: -+ for addr in kernel_stack: -+ print(" %s" % aksym(addr)) -+ if args.delimited and k.user_stack_id >= 0 and k.kernel_stack_id >= 0: -+ print(" --") -+ if stack_id_err(k.user_stack_id): -+ print(" [Missed User Stack]") -+ else: -+ for addr in user_stack: -+ print(" %s" % b.sym(addr, k.pid).decode('utf-8', 'replace')) -+ print(" %-16s %s (%d)" % ("-", k.comm.decode('utf-8', 'replace'), k.pid)) -+ print() -+ -+ -+# header -+print("Sampling at %s Hz frequency" % args.frequency, end="") -+print("... Hit Ctrl-C to end.") -+ -+# collect samples -+b["events"].open_perf_buffer(print_stack) -+stack_traces = b["stack_traces"] -+ -+while (1): -+ try: -+ b.perf_buffer_poll() -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/alisysdelay_example.txt b/tools/alisysdelay_example.txt -new file mode 100644 -index 0000000..6f8f593 ---- /dev/null -+++ b/tools/alisysdelay_example.txt -@@ -0,0 +1,75 @@ -+alisysdelay is used to catch tasks hung in kernel mode and get its stack. -+ -+In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes -+may be unacceptable, because some tasks may run too long in kernel mode -+without calling cond_sched(). These tasks may be kernel thread, or user -+processes that enter kernel mode by syscall. -+ -+alisysdelay can catch tasks that have been running continuously for a long time -+whithout scheduling, and kernel/user stackes will be printed for these tasks. -+The threshold can be user defined. -+ -+By default CPU idle stacks are excluded by simply excluding PID 0. -+ -+ -+ -+For example, -+ -+# alisysdelay -t 10 -+Sampling at 100 Hz frequency... Hit Ctrl-C to end. -+ -+ iowrite16 -+ iowrite16 -+ vp_notify -+ virtqueue_notify -+ start_xmit -+ dev_hard_start_xmit -+ sch_direct_xmit -+ __qdisc_run -+ __dev_queue_xmit -+ ip_finish_output2 -+ ip_output -+ __tcp_transmit_skb -+ tcp_write_xmit -+ __tcp_push_pending_frames -+ tcp_sendmsg_locked -+ tcp_sendmsg -+ sock_sendmsg -+ sock_write_iter -+ __vfs_write -+ vfs_write -+ ksys_write -+ do_syscall_64 -+ entry_SYSCALL_64_after_hwframe -+ __write_nocancel -+ - sshd (4617) -+ -+In this case, sshd (PID 4617) is catched for staying in kernel mode -+for more than 10us, without rescheduling. -+ -+ -+ -+USAGE message: -+# alisysdelay -h -+usage: alisysdelay.py [-h] [-f FREQUENCY] [-d] [-a] [-t THRESHOLD] -+ [--stack-storage-size STACK_STORAGE_SIZE] -+ -+Catch tasks hung in kernel mode and get its stack -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -f FREQUENCY, --frequency FREQUENCY -+ sample frequency in Hertz (default 100) -+ -d, --delimited insert delimiter between kernel/user stacks -+ -a, --annotations add _[k] annotations to kernel frames -+ -t THRESHOLD, --threshold THRESHOLD -+ set the threshold of delay in kernel mode, in us -+ (default 50) -+ --stack-storage-size STACK_STORAGE_SIZE -+ the number of unique stack traces that can be stored -+ and displayed (default 16384) -+ -+examples: -+ ./alisysdelay # catch tasks hung in kernel mode -+ ./alisysdelay -f 99 # catch tasks hung in kernel mode at 99 Hertz -+ ./alisysdelay -t 100 # catch tasks that hung in kernel mode for more than 100us -diff --git a/tools/alisyslatency.py b/tools/alisyslatency.py -new file mode 100755 -index 0000000..05bfdbd ---- /dev/null -+++ b/tools/alisyslatency.py -@@ -0,0 +1,171 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alisyslatency Summarize time spent in kernel context. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alisyslatency [-h] [-T] [-N] [-c CPU] [-p PID] [interval] [outputs] -+# -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+ -+# arguments -+examples = """examples: -+ ./alisyslatency # sum time in kernel context per CPU -+ ./alisyslatency -c 0 # show time in kernel context in CPU 0 only -+ ./alisyslatency -p 25 # show time in kernel context in pid 25 only -+ ./alisyslatency 1 10 # print 1 second summaries, 10 times -+ ./alisyslatency -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+ -+parser = argparse.ArgumentParser( -+ description="Summarize time spent in kernel context.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+ -+thread_group = parser.add_mutually_exclusive_group() -+thread_group.add_argument("-c", "--cpu", -+ help="output statistics on specific CPU only") -+thread_group.add_argument("-p", "--pid", -+ help="output statistics of specific pid only") -+ -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct map_val { -+ u64 count; -+ u64 time; -+ u64 time_max; -+} map_val_t; -+ -+BPF_PERCPU_ARRAY(start, u64, 1); -+BPF_HASH(res, u32, map_val_t); -+ -+ -+TRACEPOINT_PROBE(context_tracking, user_exit) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ u64 ts = bpf_ktime_get_ns(); -+ start.update(&idx, &ts); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(context_tracking, user_enter) -+{ -+ COND_FILTER -+ -+ u64 *tsp, delta; -+ u32 idx = 0; -+ // fetch timestamp and calculate delta -+ tsp = start.lookup(&idx); -+ if (!tsp || *tsp == 0) { -+ return 0; // missed start -+ } -+ -+#if STATISTICS_PER_PROCESS -+ u32 key = bpf_get_current_pid_tgid(); -+#else -+ u32 key = bpf_get_smp_processor_id(); -+#endif -+ -+ delta = bpf_ktime_get_ns() - *tsp; -+ map_val_t *valp, val; -+ -+ valp = res.lookup(&key); -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ if (valp->time_max < delta) {valp->time_max = delta;} -+ } -+ else { -+ val.count = 1; -+ val.time = val.time_max = delta; -+ res.update(&key, &val); -+ } -+ -+ u64 zero = 0; -+ start.update(&idx, &zero); -+ -+ return 0; -+} -+""" -+ -+cond_exp = "1" -+if args.cpu: -+ cond_exp = "bpf_get_smp_processor_id() == %s" % args.cpu -+ -+if args.pid: -+ cond_exp = "(u32)bpf_get_current_pid_tgid() == %s" % args.pid -+ -+cond_filter = "if (!(%s)) {return 0;}" % cond_exp -+bpf_text = bpf_text.replace('COND_FILTER', cond_filter) -+ -+bpf_text = bpf_text.replace('STATISTICS_PER_PROCESS', "1" if args.cpu else "0") -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing time in kernel context... Hit Ctrl-C to end.") -+tab = b.get_table("res") -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ countdown = 1 -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ # print header -+ print("%10s %10s %10s %10s" % -+ ("PID" if args.cpu else "CPU", "Count", "TOTAL_" + label, "MAX_" + label)) -+ -+ for k, v in sorted(tab.items(), key=lambda res: res[0]): -+ print("%10d %10d %10d %10d" % -+ (k, v.count, v.time / factor, v.time_max / factor)) -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alisyslatency_example.txt b/tools/alisyslatency_example.txt -new file mode 100644 -index 0000000..f5dd2c7 ---- /dev/null -+++ b/tools/alisyslatency_example.txt -@@ -0,0 +1,72 @@ -+Demonstrations of alisyslatency, the Linux eBPF/bcc version. -+ -+ -+This program traces time spent in kernel context (between -+context_tracking:user_exit and context_tracking:user_enter), -+and stores timing statistics in-kernel for efficiency. -+ -+ -+**Note** This tool needs Context tracking subsystem, which can not -+work without support of full dynticks. -+ -+ -+Example 1 -+ -+By default, all statistics of all processes on all CPUs are sampled, -+and the output is displayed in a per-CPU flavor. -+ -+# alisyslatency -+Tracing time in kernel context... Hit Ctrl-C to end. -+^C -+ CPU Count TOTAL_usecs MAX_usecs -+ -+ -+Example 2 -+ -+You can sample statistics on specified CPU only with "-c" option, -+and the output will be displayed in a per-process flavor. -+ -+# alisyslatency -c 6 -+Tracing time in kernel context... Hit Ctrl-C to end. -+^C -+ PID Count TOTAL_usecs MAX_usecs -+ -+ -+ -+Exampel 3 -+ -+You can sample statistics of specified process only with "-p" option, -+and the output will be displayed in a per-CPU flavor. -+ -+ -+# alisyslatency -p 3550 -+Tracing time in kernel context... Hit Ctrl-C to end. -+^C -+ CPU Count TOTAL_usecs MAX_usecs -+ -+ -+ -+USAGE message: -+ -+# alisyslatency -h -+usage: alisyslatency.py [-h] [-T] [-N] [-c CPU | -p PID] [interval] [outputs] -+ -+Summarize time spent in kernel context. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -c CPU, --cpu CPU output statistics on specific CPU only -+ -p PID, --pid PID output statistics of specific pid only -+ -+examples: -+ ./alisyslatency # sum time in kernel context per CPU -+ ./alisyslatency -c 0 # show time in kernel context in CPU 0 only -+ ./alisyslatency -p 25 # show time in kernel context in pid 25 only -+ ./alisyslatency 1 10 # print 1 second summaries, 10 times -+ ./alisyslatency -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/aliworkslower.py b/tools/aliworkslower.py -new file mode 100755 -index 0000000..6c7fffe ---- /dev/null -+++ b/tools/aliworkslower.py -@@ -0,0 +1,137 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# aliworkslower Trace high work sched latency. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: aliworkslower [-h] [threshold] -+# -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+import argparse -+import ctypes as ct -+ -+# arguments -+examples = """examples: -+ ./aliworkslower # trace work sched latency higher than 10000 us (default) -+ ./aliworkslower 1000 # trace work sched latency higher than 1000 us -+""" -+parser = argparse.ArgumentParser( -+ description="Trace high work sched latency", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("threshold", nargs="?", type=int, default=10000, -+ help="threshold of Q2S latency, in microsecond") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct work_timestamp { -+ u64 time_queue_work; -+ u64 time_activate_work; -+} work_timestamp_t; -+ -+typedef struct work_latency { -+ u64 pid; // pid of worker thread -+ u64 func; // function pointer of the work callback -+ u64 q2a; // queue_work to activate_work -+ u64 a2s; // activate_work to execute_start -+} work_latency_t; -+ -+BPF_HASH(res, u64, work_timestamp_t); -+BPF_PERF_OUTPUT(events); -+ -+TRACEPOINT_PROBE(workqueue, workqueue_queue_work) -+{ -+ u64 work = (u64)args->work; -+ work_timestamp_t stamp = { -+ .time_queue_work = bpf_ktime_get_ns(), -+ }; -+ res.update(&work, &stamp); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(workqueue, workqueue_activate_work) -+{ -+ u64 work = (u64)args->work; -+ work_timestamp_t *stampp = res.lookup(&work); -+ -+ if (stampp) { -+ stampp->time_activate_work = bpf_ktime_get_ns(); -+ } -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(workqueue, workqueue_execute_start) -+{ -+ u64 work = (u64)args->work; -+ u64 now = bpf_ktime_get_ns(); -+ work_timestamp_t *stampp = res.lookup(&work); -+ -+ if (stampp) { -+ u64 delta = now - stampp->time_queue_work; -+ if (delta > THRESHOLD) { -+ work_latency_t latency = { -+ .pid = (u32)bpf_get_current_pid_tgid(), -+ .func = (u64)args->function, -+ .q2a = stampp->time_activate_work - stampp->time_queue_work, -+ .a2s = now - stampp->time_activate_work, -+ }; -+ events.perf_submit(args, &latency, sizeof(work_latency_t)); -+ } -+ } -+ -+ return 0; -+} -+""" -+ -+bpf_text = bpf_text.replace('THRESHOLD', "%d" % (args.threshold * 1000)) -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+class Record(ct.Structure): -+ _fields_ = [("pid", ct.c_ulong), -+ ("func", ct.c_ulonglong), -+ ("q2a", ct.c_ulonglong), -+ ("a2s", ct.c_ulonglong)] -+ -+ -+def print_record(cpu, data, size): -+ event = ct.cast(data, ct.POINTER(Record)).contents -+ print("%10s %20s %25d %25d %25d" % (event.pid, -+ b.ksym(event.func), -+ (event.q2a + event.a2s) / 1000, -+ event.q2a / 1000, -+ event.a2s / 1000)) -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing high work sched latency... Hit Ctrl-C to end.") -+print("%10s %20s %25s %25s %25s" % -+ ("Worker Thread", "Work", "Total Q2S latency (us)", "Q2A latency (us)", "A2S latency (us)")) -+ -+b["events"].open_perf_buffer(print_record) -+ -+ -+# output -+while (1): -+ try: -+ b.perf_buffer_poll() -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/aliworkslower_example.txt b/tools/aliworkslower_example.txt -new file mode 100644 -index 0000000..21775d0 ---- /dev/null -+++ b/tools/aliworkslower_example.txt -@@ -0,0 +1,39 @@ -+aliworkslower shows high latency of work between workqueue_queue_work -+and workqueue_execute_start. -+ -+Users can specify a threshhold, and whenever the latency between -+workqueue_queue_work and workqueue_execute_start exceeds this -+threshold, related information of this work is outputed. -+ -+For example, -+ -+# ./aliworkslower 15 -+Tracing high work sched latency... Hit Ctrl-C to end. -+Worker Thread Work Total Q2S latency (us) Q2A latency (us) A2S latency (us) -+ 19783 vmstat_update 36 0 35 -+ 29195 vmstat_update 16 0 15 -+ -+- column "Worker Thread": PID of the worker thread -+- column "Work": work callback -+- column "Total Q2S latency": latency between workqueue_queue_work (Q) -+and workqueue_execute_start (S), in us -+- column "Q2A latency": latency between workqueue_queue_work (Q) -+and workqueue_activate_work (A), in us -+- column "A2S latency": latency between workqueue_activate_work (A) -+and workqueue_execute_start (S), in us -+ -+ -+# aliworkslower --help -+usage: aliworkslower.py [-h] [threshold] -+ -+Trace high work sched latency -+ -+positional arguments: -+ threshold threshold of Q2S latency, in microsecond -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -+examples: -+ ./aliworkslower # trace work sched latency higher than 10000 us (default) -+ ./aliworkslower 1000 # trace work sched latency higher than 1000 us --- -2.27.0 - diff --git a/1001-Skiping-cases-in-libbpf-tools.patch b/1001-Skiping-cases-in-libbpf-tools.patch deleted file mode 100644 index 1afa2d8dab842f9e436b61dae8b53ebe18746402..0000000000000000000000000000000000000000 --- a/1001-Skiping-cases-in-libbpf-tools.patch +++ /dev/null @@ -1,26 +0,0 @@ -From f67a5908c55568e369e4ab5b36b90aeb38c93caa Mon Sep 17 00:00:00 2001 -Message-Id: -From: "Kun(llfl)" -Date: Thu, 9 Feb 2023 14:53:53 +0800 -Subject: [PATCH 1001/1001] Skiping cases in libbpf-tools - -Signed-off-by: Kun(llfl) ---- - libbpf-tools/Makefile | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile -index a48b4e2..bb4b012 100644 ---- a/libbpf-tools/Makefile -+++ b/libbpf-tools/Makefile -@@ -57,7 +57,6 @@ APPS = \ - funclatency \ - gethostlatency \ - hardirqs \ -- javagc \ - klockstat \ - ksnoop \ - llcstat \ --- -2.39.0 - diff --git a/1003-add-u64-define.patch b/1003-add-u64-define.patch deleted file mode 100644 index 7132712ea32c9220745b99ae5517d78bc6c21bac..0000000000000000000000000000000000000000 --- a/1003-add-u64-define.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 57ef40d8219e62f791f25186077cc6fc135e246c Mon Sep 17 00:00:00 2001 -From: "songnannan.snn@alibaba-inc.com" -Date: Sun, 2 Apr 2023 07:32:35 +0800 -Subject: [PATCH 1/1] add u64 define - ---- - libbpf-tools/wakeuptime.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/libbpf-tools/wakeuptime.h b/libbpf-tools/wakeuptime.h -index 3c5376a..3f86907 100644 ---- a/libbpf-tools/wakeuptime.h -+++ b/libbpf-tools/wakeuptime.h -@@ -5,6 +5,7 @@ - - #define MAX_ENTRIES 10240 - #define TASK_COMM_LEN 16 -+#define u64 unsigned long long - - struct key_t { - char waker[TASK_COMM_LEN]; --- -2.31.1 - diff --git a/1004-fix-man-pages-installation.patch b/1004-fix-man-pages-installation.patch deleted file mode 100644 index d975f72293d5569c7e92a7efa93827e95bae43ad..0000000000000000000000000000000000000000 --- a/1004-fix-man-pages-installation.patch +++ /dev/null @@ -1,16 +0,0 @@ ---- bcc-0.27.0/man/man8/CMakeLists.txt.orig 2023-04-17 22:30:03.667858093 +0800 -+++ bcc-0.27.0/man/man8/CMakeLists.txt 2023-04-17 22:30:37.377858093 +0800 -@@ -1,12 +1,2 @@ --find_program(GZIP gzip) - file(GLOB FILES *.8) --set(GZFILES "") --foreach(FIL ${FILES}) -- get_filename_component(NAME ${FIL} NAME) -- add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz -- COMMAND ${GZIP} -c ${FIL} > ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz -- DEPENDS ${FIL}) -- list(APPEND GZFILES "${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz") --endforeach() --add_custom_target(man ALL DEPENDS ${GZFILES}) --install(FILES ${GZFILES} DESTINATION share/bcc/man/man8) -+install(FILES ${FILES} DESTINATION ${CMAKE_INSTALL_MANDIR}/man8) diff --git a/bcc.spec b/bcc.spec index 7e6a23ad8ce8949b924e1543c9d0eb1cc690cee2..46eb78e80952364f3eb01398a054a3365378f9b2 100644 --- a/bcc.spec +++ b/bcc.spec @@ -1,4 +1,4 @@ -%define anolis_release 5 +%define anolis_release 1 %ifarch loongarch64 %bcond_without ignore_lua %bcond_with libbpf_tools @@ -9,17 +9,17 @@ Name: bcc -Version: 0.27.0 +Version: 0.30.0 Release: %{anolis_release}%{?dist} Summary: BPF Compiler Collection (BCC) License: ASL 2.0 URL: https://github.com/iovisor/bcc Source0: %{url}/archive/refs/tags/v%{version}.tar.gz -Patch1000: 1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch -Patch1001: 1001-Skiping-cases-in-libbpf-tools.patch -Patch1003: 1003-add-u64-define.patch -Patch1004: 1004-fix-man-pages-installation.patch +#Patch1000: 1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch +#Patch1001: 1001-Skiping-cases-in-libbpf-tools.patch +#Patch1003: 1003-add-u64-define.patch +#Patch1004: 1004-fix-man-pages-installation.patch BuildRequires: cmake >= 2.8.12 llvm-devel clang-devel llvm-googletest BuildRequires: flex bison python3-setuptools @@ -131,10 +131,12 @@ find %{buildroot}%{_datadir}/%{name}/{tools,examples} -type f -exec \ -e '1s=^#!/usr/bin/env python\([0-9.]\+\)\?$=#!%{__python3}=' \ -e '1s=^#!/usr/bin/env bcc-lua$=#!/usr/bin/bcc-lua=' {} \; -# Move man pages to the right location to avoid conflict with other manpages -pushd %{buildroot}%{_mandir}/man8 - find . -name '*.8' -exec bash -c 'mv $0 bcc-`basename $0`' {} \; -popd +mkdir -p %{buildroot}%{_mandir} +mv %{buildroot}%{_datadir}/%{name}/man/* %{buildroot}%{_mandir}/ +for i in `find %{buildroot}%{_mandir} -name "*.gz"`; do + tname=$(basename $i) + rename $tname %{name}-$tname $i +done mkdir -p %{buildroot}%{_docdir}/%{name} mv %{buildroot}%{_datadir}/%{name}/examples %{buildroot}%{_docdir}/%{name}/ @@ -189,6 +191,9 @@ install -p libbpf-tools/tmp-install/bin/* %{buildroot}/%{_sbindir}/ %endif %changelog +* Wed May 08 2024 mgb01105731 - 0.30.0-1 +- update to 0.30.0 + * Thu Mar 28 2024 Zhao Hang - 0.27.0-5 - Rebuild with python3.11 diff --git a/v0.27.0.tar.gz b/v0.30.0.tar.gz similarity index 42% rename from v0.27.0.tar.gz rename to v0.30.0.tar.gz index ace68f77c057b7c9ac17e8909fd6dcf7f24d5cb9..d8ddab4d0aa3238bc12594f79b9961ed99cb77b2 100644 Binary files a/v0.27.0.tar.gz and b/v0.30.0.tar.gz differ