diff --git a/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch b/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch
deleted file mode 100644
index e6348c7bed408c027b6313ace99cb1054207780a..0000000000000000000000000000000000000000
--- a/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch
+++ /dev/null
@@ -1,6172 +0,0 @@
-diff --git a/man/man8/alibiolatency.8 b/man/man8/alibiolatency.8
-new file mode 100644
-index 0000000..93f52fa
---- /dev/null
-+++ b/man/man8/alibiolatency.8
-@@ -0,0 +1,76 @@
-+.TH alibiolatency 8  "2019-08-25" "USER COMMANDS"
-+.SH NAME
-+alibiolatency \- Summarize block device I/O latency.
-+.SH SYNOPSIS
-+.B alibiolatency [\-h] [\-i interval] [-d device] [\-t avg_req_time] [\-T req_time] [-r]
-+.SH DESCRIPTION
-+
-+Alibiolatency can trace every io request's life cycle and do data filter and statistics
-+accordingly. With this tool, we can clearly know io request's different stage's latency
-+proportion, for example, if io request D2C(see blktrace) time take majority in total io
-+request process time, it's most likely disk bottleneck has been reached.
-+
-+This tool uses in-kernel eBPF maps for storing timestamps for efficiency.
-+
-+This works by tracing various kernel block tracepoints using dynamic tracing,
-+and will need updating to match any changes to these tracepoints.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+\-h
-+Print usage message.
-+.TP
-+\-i interval
-+Specify display interval, unit is second.
-+.TP
-+\-r
-+Dump every io request life cycle.
-+.TP
-+\-d device
-+Inspect specified device.
-+.TP
-+\-t avg_req_time
-+Display only when average request process time is greater than this value, unit is ms.
-+.TP
-+\-T req_time
-+Dump request life cycle when single request process time is greater than this value, unit is ms.
-+.SH EXAMPLES
-+.TP
-+Summarize block device I/O latency(default display interval is 2 seconds):
-+#
-+.B alibiolatency
-+.TP
-+Print statistics per 5 seconds
-+#
-+.B alibiolatency \-i 5
-+.TP
-+Print only when average request process time is greater than 10ms
-+#
-+.B alibiolatency \-t 10
-+.TP
-+Print request life cycle when single request process time is greater than 20ms.
-+#
-+.B alibiolatency \-T 20
-+.SH OVERHEAD
-+This traces kernel functions and maintains in-kernel timestamps and a histogram,
-+which are asynchronously copied to user-space. This method is very efficient,
-+and the overhead for most storage I/O rates (< 10k IOPS) should be negligible.
-+If you have a higher IOPS storage environment, test and quantify the overhead
-+before use.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Xiaoguang Wang
-+.SH SEE ALSO
-+biosnoop(8)
-diff --git a/man/man8/aliexitsnoop.8 b/man/man8/aliexitsnoop.8
-new file mode 100644
-index 0000000..cd7215e
---- /dev/null
-+++ b/man/man8/aliexitsnoop.8
-@@ -0,0 +1,109 @@
-+.TH aliexitsnoop 8  "2019-05-28" "USER COMMANDS"
-+.SH NAME
-+aliexitsnoop \- Trace all process termination (exit, fatal signal). Uses Linux eBPF/bcc.
-+.SH SYNOPSIS
-+.B aliexitsnoop [\-h] [\-t] [\-\-utc] [\-x] [\-p PID] [\-\-label LABEL]
-+.SH DESCRIPTION
-+aliexitsnoop traces process termination, showing the command name and reason for
-+termination, either an exit or a fatal signal. This Linux tool is actually an enhanced
-+version of the exitsnoop. For more details about exitsnoop, see exitsnoop_example.txt.
-+Compared to the exitsnoop, aliexitsnoop not only print the exit code, pid, ppid, lifetime
-+of the exiting processes, but also the stack of the exiting processes. The demand stems
-+from the fact that in the production environment, some processes end up unexpectedly.
-+Therefore we need to dump the stack of the target processes to find out what caused the
-+target process to exit.
-+
-+It catches processes of all users, processes in containers, as well
-+as processes that become zombie.
-+
-+This works by tracing the kernel do_exit() function,
-+and will need updating to match any changes to this function.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+\-t
-+Include a timestamp column.
-+.TP
-+\-\-utc
-+Include a timestamp column, use UTC timezone.
-+.TP
-+\-x
-+Exclude successful exits, exit( 0 )
-+.TP
-+\-p PID
-+Trace this process ID only (filtered in-kernel).
-+.TP
-+\-\-label LABEL
-+Label each line with LABEL (default 'exit') in first column (2nd if timestamp is present).
-+.SH EXAMPLES
-+.TP
-+Trace all process termination
-+#
-+.B aliexitsnoop
-+.TP
-+Trace all process termination, and include timestamps:
-+#
-+.B aliexitsnoop \-t
-+.TP
-+Exclude successful exits, only include non-zero exit codes and fatal signals:
-+#
-+.B aliexitsnoop \-x
-+.TP
-+Trace PID 181 only:
-+#
-+.B aliexitsnoop \-p 181
-+.TP
-+Label each output line with 'EXIT':
-+#
-+.B aliexitsnoop \-\-label EXIT
-+.SH FIELDS
-+.TP
-+TIME-TZ
-+Time of process termination HH:MM:SS.sss with milliseconds, where TZ is
-+the local time zone, 'UTC' with \-\-utc option.
-+.TP
-+LABEL
-+The optional label if \-\-label option is used.  This is useful with the
-+\-t option for timestamps when the output of several tracing tools is
-+sorted into one combined output.
-+.TP
-+PCOMM
-+Process/command name.
-+.TP
-+PID
-+Process ID
-+.TP
-+PPID
-+The process ID of the process that will be notified of PID termination.
-+.TP
-+TID
-+Thread ID.
-+.TP
-+EXIT_CODE
-+The exit code for exit() or the signal number for a fatal signal.
-+.SH OVERHEAD
-+This traces the kernel do_exit() function and prints output for each event.
-+As the rate of this is generally expected to be low (< 1000/s), the overhead is also
-+expected to be negligible. If you have an application that has a high rate of
-+process termination, then test and understand overhead before use.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Alvin Zheng
-+.SH SEE ALSO
-+aliexecsnoop(8)
-diff --git a/man/man8/aliext4writeslower.8 b/man/man8/aliext4writeslower.8
-new file mode 100644
-index 0000000..007a474
---- /dev/null
-+++ b/man/man8/aliext4writeslower.8
-@@ -0,0 +1,70 @@
-+.TH aliext4writeslower 8  "2019-08-30" "USER COMMANDS"
-+.SH NAME
-+aliext4writeslower \- Trace common ext4 write(2) slower than a threshold
-+.SH SYNOPSIS
-+.B aliext4writeslower [-h] [-p PID] [-d device] [min_ms]
-+.SH DESCRIPTION
-+
-+This script traces key steps of write(2) which maybe time-consuming,
-+here trace 4 key steps: jbd2 handle start(it may need jbd2 transaction
-+checkpoint or commit), page cache allocation, block allocation and
-+balance dirty pages, and we also take into account sched delay.
-+
-+With this tool, we can clearly know write(2)'s latency proportion.
-+
-+This tool uses in-kernel eBPF maps for storing timestamps for efficiency.
-+
-+This works by tracing various kernel functions using dynamic tracing,
-+and will need updating to match any changes to these tracepoints.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+\-h
-+Print usage message.
-+.TP
-+\-i pid
-+trace this PID only
-+.TP
-+min_ms
-+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
-+.SH EXAMPLES
-+.TP
-+Trace write(2) slower than 10 ms (default).
-+#
-+.B aliext4writeslower
-+.TP
-+Trace write(2) slower than 5 ms.
-+#
-+.B aliext4writeslower 5
-+.TP
-+Trace PID 185 only.
-+#
-+.B aliext4writeslower -p 185
-+.TP
-+Trace /dev/sda5 which contains ext4 fs.
-+#
-+.B aliext4writeslower -d sda5
-+.TP
-+.SH OVERHEAD
-+This traces kernel functions and maintains in-kernel timestamps and a histogram,
-+which are asynchronously copied to user-space. This method is very efficient,
-+and the overhead for most storage I/O rates (< 10k IOPS) should be negligible.
-+If you have a higher IOPS storage environment, test and quantify the overhead
-+before use.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Xiaoguang Wang
-+.SH SEE ALSO
-+biosnoop(8)
-diff --git a/man/man8/alihardirqs.8 b/man/man8/alihardirqs.8
-new file mode 100644
-index 0000000..d874e89
---- /dev/null
-+++ b/man/man8/alihardirqs.8
-@@ -0,0 +1,99 @@
-+.TH alihardirqs 8  "2019-7-25" "USER COMMANDS"
-+.SH NAME
-+alihardirqs \- Summarize hard IRQ (interrupt) event time.
-+.SH SYNOPSIS
-+.B alihardirqs [\-h] [\-T] [\-N] [\-D] [\-c CPU] [\-i IRQ] [interval] [outputs]
-+.SH DESCRIPTION
-+This program summarizes hard irq statistics.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+\-T
-+Include timestamps on output.
-+.TP
-+\-N
-+Output in nanosecond.
-+.TP
-+\-D
-+Show distributions as histograms.
-+.TP
-+\-c
-+Show hardIRQ statistics on specific CPU only.
-+.TP
-+\-i
-+Show statistics of specific hardIRQ only.
-+.TP
-+interval
-+Output interval, in seconds.
-+.TP
-+outputs
-+Number of outputs.
-+.SH EXAMPLES
-+.TP
-+Summarize hardIRQ statistics:
-+#
-+.B alihardirqs
-+.TP
-+Print 1 second summaries, 10 times:
-+#
-+.B alihardirqs 1 10
-+.TP
-+Print 1s summaries, nanoseconds, and timestamps:
-+#
-+.B alihardirqs \-NT 1
-+.TP
-+Show hardIRQ statistics as histograms:
-+#
-+.B alihardirqs \-D
-+.TP
-+Show hardIRQ statistics of CPU 0 only:
-+#
-+.B alihardirqs \-c 0
-+.TP
-+Show statistics of hardIRQ 25 only:
-+#
-+.B alihardirqs \-i 25
-+.SH FIELDS
-+.TP
-+CPU
-+CPU index.
-+.TP
-+HARDIRQ
-+hardIRQ number.
-+.TP
-+IRQHandler
-+Name of hardIRQ handler.
-+.TP
-+Count
-+Number of samples on specific CPU or hardIRQ.
-+.TP
-+Total_<usecs/nsecs>
-+Number of time spent on specific hardIRQ.
-+.TP
-+MAX_<usecs/nsecs>
-+Maximum of time spent on specific hardIRQ.
-+.TP
-+distribution
-+An ASCII bar chart to visualize the distribution (Total_<usecs/nsecs> column)
-+.SH OVERHEAD
-+This sampls at every irq_handler_entry/irq_handler_exit tracepoint, and in-kernel
-+summaries.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Jeffle Xu
-diff --git a/man/man8/alijbd2stat.8 b/man/man8/alijbd2stat.8
-new file mode 100644
-index 0000000..4f2de07
---- /dev/null
-+++ b/man/man8/alijbd2stat.8
-@@ -0,0 +1,61 @@
-+.TH alijbd2stat 8  "2019-08-25" "USER COMMANDS"
-+.SH NAME
-+alijbd2stat \- Summarize filesystem jbd2 stat.
-+.SH SYNOPSIS
-+.B alijbd2stat [\-h] [\-d device] [\-t thresh]
-+.SH DESCRIPTION
-+
-+Alijbd2stat can trace every jbd2 handle and every transaction commit info.
-+
-+If one jbd2 handle's running time is greater than user specified threshold time, Alijbd2stat
-+will catch this handle's context.
-+
-+This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
-+for efficiency.
-+
-+This works by tracing various kernel block tracepoints using dynamic tracing,
-+and will need updating to match any changes to these tracepoints.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+\-h
-+Print usage message.
-+.TP
-+\-d device
-+Inspect specified device.
-+.TP
-+\-t thresh
-+Show jbd2 handle's context when its run time is greater than this threshold.
-+.SH EXAMPLES
-+.TP
-+Print all jbd2 instances stat.
-+#
-+.B alijbd2stat
-+.TP
-+Inspect specified device /dev/sda3, on which some filesystem uses jbd2.
-+#
-+.B alijbd2stat \-d sda3
-+.TP
-+Show jbd2 handle's context when it runs more than 10ms
-+#
-+.B alijbd2stat \-t 10
-+.TP
-+.SH OVERHEAD
-+Test and quantify the overhead before use.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Xiaoguang Wang
-+.SH SEE ALSO
-+biosnoop(8)
-diff --git a/man/man8/alimutexsnoop.8 b/man/man8/alimutexsnoop.8
-new file mode 100644
-index 0000000..f0342b4
---- /dev/null
-+++ b/man/man8/alimutexsnoop.8
-@@ -0,0 +1,71 @@
-+.TH alimutexsnoop 8  "2019-05-28" "USER COMMANDS"
-+.SH NAME
-+alimutexsnoop \- Trace the mutex that is held by the process longer than a specified time threshold. Uses Linux eBPF/bcc.
-+.SH SYNOPSIS
-+.B alimutexsnoop [\-h] [\-t THRESHOLD] [\-p PID]
-+.SH DESCRIPTION
-+alimutexsnoop monitors the mutex that is held by the process longer than
-+a specified time threshold, such as 100us. Once alimutexsnoop finds such mutexes,
-+it prints the pid of the process that held the mutex and the address of the
-+mutex. In addition, alimutexsnoop also prints the user and kernel stack of the
-+process.
-+
-+This works by tracing the kernel mutex_lock() and mutex_unlock() function,
-+and will need updating to match any changes to this function.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+\-t
-+Set the time threshold. The mutexes held longer than this time threshold will be traced(us).
-+.TP
-+\-p PID
-+Trace this process ID only (filtered in-kernel).
-+.SH EXAMPLES
-+.TP
-+Trace all mutexes that are held longer than 1000us (default):
-+#
-+.B alimutexsnoop
-+.TP
-+Trace all mutexes that are held longer than 50us:
-+#
-+.B alimutexsnoop \-t 50
-+.TP
-+Trace PID 181 only:
-+#
-+.B alimutexsnoop \-p 181
-+.SH FIELDS
-+.TP
-+PCOMM
-+Process/command name.
-+.TP
-+PID
-+Process ID
-+.TP
-+MUTEX_ADDR
-+The address of the mutex object.
-+.SH OVERHEAD
-+This traces the kernel mutex_lock() and mutex_unlock() function and compute the time
-+length of the process holds the mutex. The overhead is positively related to the frequency
-+of acquiring and releasing mutex. If you have an application that gets and releases mutexes
-+frequently, then test and understand overhead before use.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Alvin Zheng
-+.SH SEE ALSO
-+alimutexsnoop(8)
-diff --git a/man/man8/alirunqinfo.8 b/man/man8/alirunqinfo.8
-new file mode 100644
-index 0000000..1ffa3cb
---- /dev/null
-+++ b/man/man8/alirunqinfo.8
-@@ -0,0 +1,109 @@
-+.TH alirunqinfo 8  "2019-7-25" "USER COMMANDS"
-+.SH NAME
-+alirunqinfo \- Summarize runqueue statistics.
-+.SH SYNOPSIS
-+.B alirunqinfo [\-h] [\-T] [\-N] [\-D] [\-C] [\-P] [\-c CPU] [\-p PID] [interval] [outputs]
-+.SH DESCRIPTION
-+This program summarizes task on-CPU time before being descheduled.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+\-T
-+Include timestamps on output.
-+.TP
-+\-N
-+Output in nanosecond.
-+.TP
-+\-D
-+Show distributions as histograms.
-+.TP
-+\-C
-+Show per-CPU runqueue statistics.
-+.TP
-+\-P
-+Show per-thread runqueue statistics.
-+.TP
-+\-c
-+Show runqueue statistics on specific CPU only.
-+.TP
-+\-p
-+Show runqueue statistics of specific PID only.
-+.TP
-+interval
-+Output interval, in seconds.
-+.TP
-+outputs
-+Number of outputs.
-+.SH EXAMPLES
-+.TP
-+Summarize runqueue statistics:
-+#
-+.B alirunqinfo
-+.TP
-+Print 1 second summaries, 10 times:
-+#
-+.B alirunqinfo 1 10
-+.TP
-+Print 1s summaries, nanoseconds, and timestamps:
-+#
-+.B alirunqinfo \-NT 1
-+.TP
-+Show per-CPU runqueue statistics:
-+#
-+.B alirunqinfo \-C
-+.TP
-+Show per-thread runqueue statistics:
-+#
-+.B alirunqinfo \-P
-+.TP
-+Show runqueue statistics as histograms:
-+#
-+.B alirunqinfo \-D
-+.TP
-+Show runqueue statistics of CPU 0 only:
-+#
-+.B alirunqinfo \-c 0
-+.TP
-+Show runqueue statistics of PID 25 only:
-+#
-+.B alirunqinfo \-p 25
-+.SH FIELDS
-+.TP
-+CPU
-+CPU index.
-+.TP
-+PID
-+PID of sampled process.
-+.TP
-+Thread
-+Name of the specified process.
-+.TP
-+Count
-+Number of samples on specific CPU or PID.
-+.TP
-+Total_<usecs/nsecs>
-+Number of task on-CPU time accumulated on specific CPU or PID.
-+.TP
-+distribution
-+An ASCII bar chart to visualize the distribution (Total_<usecs/nsecs> column)
-+.SH OVERHEAD
-+This sampls at every sched_switch tracepoint, and in-kernel summaries.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Jeffle Xu
-diff --git a/man/man8/alisoftirqs.8 b/man/man8/alisoftirqs.8
-new file mode 100644
-index 0000000..1fc9c50
---- /dev/null
-+++ b/man/man8/alisoftirqs.8
-@@ -0,0 +1,99 @@
-+.TH alisoftirqs 8  "2019-7-25" "USER COMMANDS"
-+.SH NAME
-+alisoftirqs \- Summarize soft IRQ (interrupt) event time.
-+.SH SYNOPSIS
-+.B alisoftirqs [\-h] [\-T] [\-N] [\-D] [\-c CPU] [\-i IRQ] [interval] [outputs]
-+.SH DESCRIPTION
-+This program summarizes soft irq statistics.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+\-T
-+Include timestamps on output.
-+.TP
-+\-N
-+Output in nanosecond.
-+.TP
-+\-D
-+Show distributions as histograms.
-+.TP
-+\-c
-+Show softIRQ statistics on specific CPU only.
-+.TP
-+\-i
-+Show statistics of specific softIRQ only.
-+.TP
-+interval
-+Output interval, in seconds.
-+.TP
-+outputs
-+Number of outputs.
-+.SH EXAMPLES
-+.TP
-+Summarize softIRQ statistics:
-+#
-+.B alisoftirqs
-+.TP
-+Print 1 second summaries, 10 times:
-+#
-+.B alisoftirqs 1 10
-+.TP
-+Print 1s summaries, nanoseconds, and timestamps:
-+#
-+.B alisoftirqs \-NT 1
-+.TP
-+Show softIRQ statistics as histograms:
-+#
-+.B alisoftirqs \-D
-+.TP
-+Show softIRQ statistics of CPU 0 only:
-+#
-+.B alisoftirqs \-c 0
-+.TP
-+Show statistics of softIRQ 25 only:
-+#
-+.B alisoftirqs \-i 25
-+.SH FIELDS
-+.TP
-+CPU
-+CPU index.
-+.TP
-+SOFTIRQ
-+softIRQ number.
-+.TP
-+IRQHandler
-+Name of softIRQ handler.
-+.TP
-+Count
-+Number of samples on specific CPU or softIRQ.
-+.TP
-+Total_<usecs/nsecs>
-+Number of time spent on specific softIRQ.
-+.TP
-+MAX_<usecs/nsecs>
-+Maximum of time spent on specific softIRQ.
-+.TP
-+distribution
-+An ASCII bar chart to visualize the distribution (Total_<usecs/nsecs> column)
-+.SH OVERHEAD
-+This sampls at every softirq_entry/softirq_exit tracepoint, and in-kernel
-+summaries.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Jeffle Xu
-diff --git a/man/man8/alisyscount.8 b/man/man8/alisyscount.8
-new file mode 100644
-index 0000000..284084a
---- /dev/null
-+++ b/man/man8/alisyscount.8
-@@ -0,0 +1,112 @@
-+.TH alisyscount 8  "2019-7-25" "USER COMMANDS"
-+.SH NAME
-+alisyscount \- Summarize syscall counts and latencies.
-+.SH SYNOPSIS
-+.B alisyscount [\-h] [\-T] [\-N] [\-D] [\-S] [\-P] [\-s syscall] [\-p PID] [interval] [outputs]
-+.SH DESCRIPTION
-+This program summarizes syscall counts and latencies.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+\-T
-+Include timestamps on output.
-+.TP
-+\-N
-+Output in nanosecond.
-+.TP
-+\-D
-+Show distributions as histograms.
-+.TP
-+\-S
-+Show per-syscall statistics.
-+.TP
-+\-P
-+Show per-thread statistics.
-+.TP
-+\-s
-+Show statistics of specific syscall only.
-+.TP
-+\-p
-+Show syscall statistics of specific PID only.
-+.TP
-+interval
-+Output interval, in seconds.
-+.TP
-+outputs
-+Number of outputs.
-+.SH EXAMPLES
-+.TP
-+Summarize syscall statistics:
-+#
-+.B alisyscount
-+.TP
-+Print 1 second summaries, 10 times:
-+#
-+.B alisyscount 1 10
-+.TP
-+Print 1s summaries, nanoseconds, and timestamps:
-+#
-+.B alisyscount \-NT 1
-+.TP
-+Show syscall statistics as histograms:
-+#
-+.B alisyscount \-D
-+.TP
-+Show per-syscall statistics:
-+#
-+.B alisyscount \-S
-+.TP
-+Show per-thread syscall statistics:
-+#
-+.B alisyscount \-P
-+.TP
-+Show statistics of syscall 25 only:
-+#
-+.B alisyscount \-s 25
-+.TP
-+Show syscall statistics of PID 1213 only:
-+#
-+.B alisyscount \-p 1213
-+.SH FIELDS
-+.TP
-+SysNR
-+syscall number.
-+.TP
-+Syscall
-+Name of syscall.
-+.TP
-+PID
-+PID of specific process.
-+.TP
-+Thread
-+Name of specific process.
-+.TP
-+Count
-+Number of samples on specific PID or syscall.
-+.TP
-+Total_<usecs/nsecs>
-+Number of time spent on specific syscall.
-+.TP
-+distribution
-+An ASCII bar chart to visualize the distribution (Total_<usecs/nsecs> column)
-+.SH OVERHEAD
-+This sampls at every sys_enter/sys_exit tracepoint, and in-kernel summaries.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Jeffle Xu
-diff --git a/man/man8/alisysdelay.8 b/man/man8/alisysdelay.8
-new file mode 100644
-index 0000000..a7d91aa
---- /dev/null
-+++ b/man/man8/alisysdelay.8
-@@ -0,0 +1,71 @@
-+.TH alisysdelay 8  "2019-7-25" "USER COMMANDS"
-+.SH NAME
-+alisysdelay \- Catches tasks hung in kernel mode and get its stack.
-+.SH SYNOPSIS
-+.B alisysdelay [\-h] [\-d] [\-a] [\-f frequency] [\-t threshold] [\-\-stack\-storage\-size size]
-+.SH DESCRIPTION
-+This program catches tasks hung in kernel mode and get its stack.
-+
-+In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes
-+may be unacceptable, because some tasks may run too long in kernel mode
-+without calling cond_sched(). These tasks may be kernel thread, or user
-+processes that enter kernel mode by syscall.
-+
-+alisysdelay can catch tasks that have been running continuously for a long time
-+whithout scheduling, and kernel/user stackes of these tasks will be printed.
-+The threshold can be user defined.
-+
-+By default CPU idle stacks are excluded by simply excluding PID 0.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+\-d
-+Insert delimiter between kernel/user stacks.
-+.TP
-+\-a
-+Add _[k] annotations to kernel frames.
-+.TP
-+\-f
-+Sample frequency in Hertz.
-+.TP
-+\-t
-+Set the threshold of delay in kernel mode, in us.
-+.TP
-+\-\-stack\-storage\-size
-+The number of unique stack traces that can be stored and displayed.
-+.SH EXAMPLES
-+.TP
-+Catches tasks hung in kernel mode and get its stack:
-+#
-+.B alisysdelay
-+.TP
-+Catches tasks hung in kernel mode at 99 Hz:
-+#
-+.B alisysdelay \-f 99
-+.TP
-+Catches tasks that hung in kernel mode for more than 100us:
-+#
-+.B alisysdelay \-t 100
-+.SH FIELDS
-+.SH OVERHEAD
-+This sampls at every sys_enter/sched_switch tracepoint and _cond_resched kprobe,
-+and in-kernel summaries.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Jeffle Xu
-diff --git a/man/man8/alisyslatency.8 b/man/man8/alisyslatency.8
-new file mode 100644
-index 0000000..854b7a0
---- /dev/null
-+++ b/man/man8/alisyslatency.8
-@@ -0,0 +1,85 @@
-+.TH alisyslatency 8  "2019-7-25" "USER COMMANDS"
-+.SH NAME
-+alisyslatency \- Summarize time spent in kernel context..
-+.SH SYNOPSIS
-+.B alisyslatency [\-h] [\-T] [\-N] [\-c CPU] [\-p PID] [interval] [outputs]
-+.SH DESCRIPTION
-+This program summarizes time spent in kernel context..
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+\-T
-+Include timestamps on output.
-+.TP
-+\-N
-+Output in nanosecond.
-+.TP
-+\-c
-+Show statistics on specific CPU only.
-+.TP
-+\-p
-+Show statistics of specific PID only.
-+.TP
-+interval
-+Output interval, in seconds.
-+.TP
-+outputs
-+Number of outputs.
-+.SH EXAMPLES
-+.TP
-+Summarize time in kernel context per CPU:
-+#
-+.B alisyslatency
-+.TP
-+Print 1 second summaries, 10 times:
-+#
-+.B alisyslatency 1 10
-+.TP
-+Print 1s summaries, nanoseconds, and timestamps:
-+#
-+.B alisyslatency \-NT 1
-+.TP
-+Show time in kernel context in CPU 0 only:
-+#
-+.B alisyslatency \-c 0
-+.TP
-+Show time in kernel context in pid 25 only:
-+#
-+.B alisyslatency \-p 25
-+.SH FIELDS
-+.TP
-+CPU
-+CPU index.
-+.TP
-+PID
-+PID of sampled process.
-+.TP
-+Count
-+Number of samples on specific CPU or PID.
-+.TP
-+Total_<usecs/nsecs>
-+Number of time spent on specific PID.
-+.TP
-+MAX_<usecs/nsecs>
-+Maximum of time spent on specific PID.
-+.SH OVERHEAD
-+This sampls at every user_exit/user_enter tracepoint, and in-kernel summaries.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Jeffle Xu
-diff --git a/man/man8/aliworkslower.8 b/man/man8/aliworkslower.8
-new file mode 100644
-index 0000000..4dac8fb
---- /dev/null
-+++ b/man/man8/aliworkslower.8
-@@ -0,0 +1,59 @@
-+.TH aliworkslower 8  "2019-7-25" "USER COMMANDS"
-+.SH NAME
-+aliworkslower \- Trace high work sched latency.
-+.SH SYNOPSIS
-+.B aliworkslower [\-h] [threshold]
-+.SH DESCRIPTION
-+This program traces high work sched latency.
-+
-+Since this uses BPF, only the root user can use this tool.
-+.SH REQUIREMENTS
-+CONFIG_BPF and bcc.
-+.SH OPTIONS
-+.TP
-+\-h
-+Print usage message.
-+.TP
-+[threshold]
-+Set threshold of Q2S latency, in microsecond.
-+.SH EXAMPLES
-+.TP
-+Trace work sched latency higher than 10000 us (default):
-+#
-+.B aliworkslower
-+.TP
-+Trace work sched latency higher than  1000 us:
-+#
-+.B aliworkslower 1000
-+.SH FIELDS
-+.TP
-+Worker Thread
-+PID of worker thread.
-+.TP
-+Work
-+Work callback.
-+.TP
-+Total Q2S latency
-+Latency between workqueue_queue_work (Q) and workqueue_execute_start (S), in us.
-+.TP
-+Q2A latency
-+Latency between workqueue_queue_work (Q) and workqueue_execute_start (S), in us.
-+.TP
-+A2S latency
-+Latency between workqueue_activate_work (A) and workqueue_execute_start (S), in us.
-+.SH OVERHEAD
-+This sampls at every workqueue_queue_work/workqueue_activate_work/
-+workqueue_execute_start tracepoint, and in-kernel summaries.
-+.SH SOURCE
-+This is from bcc.
-+.IP
-+https://github.com/iovisor/bcc
-+.PP
-+Also look in the bcc distribution for a companion _examples.txt file containing
-+example usage, output, and commentary for this tool.
-+.SH OS
-+Linux
-+.SH STABILITY
-+Unstable - in development.
-+.SH AUTHOR
-+Jeffle Xu
-diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py
-index 9222e37..db51793 100755
---- a/tests/python/test_tools_smoke.py
-+++ b/tests/python/test_tools_smoke.py
-@@ -421,5 +421,53 @@ class SmokeTests(TestCase):
-         # Fails to attach the probe if zfs is not installed.
-         pass
- 
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alirunqinfo(self):
-+        self.run_with_duration("alirunqinfo.py 1 1")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alihardirqs(self):
-+        self.run_with_duration("alihardirqs.py 1 1")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alisoftirqs(self):
-+        self.run_with_duration("alisoftirqs.py 1 1")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alisyscount(self):
-+        self.run_with_duration("alisyscount.py 1 1")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alisysdelay(self):
-+        self.run_with_int("alisysdelay.py", timeout=10, kill_timeout=15)
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alisyslatency(self):
-+        self.run_with_duration("alisyslatency.py 1 1")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_aliworkslower(self):
-+        self.run_with_int("aliworkslower.py", timeout=10, kill_timeout=15)
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_aliexitsnoop(self):
-+        self.run_with_int("aliexitsnoop.py ")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alimutexsnoop(self):
-+        self.run_with_int("alimutexsnoop.py ")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alibiolatency(self):
-+        self.run_with_int("alibiolatency.py ")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alijbd2stat(self):
-+        self.run_with_int("alijbd2stat.py ")
-+
-+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
-+    def test_alijbd2stat(self):
-+        self.run_with_int("aliext4writeslower.py ")
-+
- if __name__ == "__main__":
-     main()
-diff --git a/tools/alibiolatency.py b/tools/alibiolatency.py
-new file mode 100755
-index 0000000..07f98d3
---- /dev/null
-+++ b/tools/alibiolatency.py
-@@ -0,0 +1,532 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# alibiolatency    Summarize block device I/O latency.
-+#       For Linux, uses BCC, eBPF.
-+#
-+# USAGE: alibiolatency [-h] [-i interval] [-d device] [-t avg_req_time] [-T req_time] [-r]
-+#
-+# Copyright (c) 2019-2021 Alibaba Group.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+#
-+# 2019/07/01 Xiaoguang Wang Created this.
-+
-+from __future__ import print_function
-+from bcc import BPF
-+import ctypes as ct
-+import argparse
-+import time
-+
-+# arguments
-+examples = """examples:
-+    ./alibiolatency          # summarize block I/O latency(default display interval is 2s)
-+    ./alibiolatency -d sda3  # inspect specified device /dev/sda3
-+    ./alibiolatency -i 2     # specify display interval, 2s
-+    ./alibiolatency -t 10    # display only when average request process time is greater than 10ms
-+    ./alibiolatency -T 20    # dump request life cycle when single request process time is greater than 20ms
-+    ./alibiolatency -r       # dump every io request life cycle
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Summarize block device I/O latency",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-d", "--device", help="inspect specified device")
-+parser.add_argument("-i", "--dis_interval", nargs="?", default=2,
-+    help="specify display interval")
-+parser.add_argument("-t", "--avg_threshold_time", nargs="?", default=0,
-+    help="display only when average request process time is greater than this value")
-+parser.add_argument("-T", "--threshold_time", nargs="?", default=999999999,
-+    help="dump request life cycle when single request process time is greater than this value")
-+parser.add_argument("-r", "--dump_raw", action="store_true", help="dump every io request life cycle")
-+parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
-+args = parser.parse_args()
-+threshold_time = int(args.threshold_time)
-+avg_threshold_time = int(args.avg_threshold_time)
-+dis_interval = int(args.dis_interval)
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+#include <linux/blkdev.h>
-+
-+struct req_stat {
-+	dev_t dev;
-+	u64 queue_us;
-+	u64 getrq_us;
-+	u64 insert_us;
-+	u64 issue_us;
-+	u64 completion_us;
-+	u64 sector;
-+	u32 num_sectors;
-+	char rwbs[8];
-+};
-+
-+typedef struct disk_key {
-+	dev_t dev;
-+	u64 sector;
-+} disk_key_t;
-+
-+BPF_HASH(io_requests, disk_key_t, struct req_stat);
-+BPF_PERF_OUTPUT(req_stat);
-+
-+
-+TRACEPOINT_PROBE(block, block_bio_queue)
-+{
-+	disk_key_t key = {};
-+	struct req_stat s = {};
-+	dev_t dev = args->dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	key.dev = args->dev;
-+	key.sector = args->sector;
-+	s.dev = key.dev;
-+	s.queue_us = bpf_ktime_get_ns() / 1000;
-+	bpf_probe_read(s.rwbs, 8, args->rwbs);
-+	io_requests.update(&key, &s);
-+	return 0;
-+}
-+
-+TRACEPOINT_PROBE(block, block_getrq)
-+{
-+	struct req_stat *r;
-+	disk_key_t key = {};
-+	dev_t dev = args->dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	key.dev = args->dev;
-+	key.sector = args->sector;
-+	r = io_requests.lookup(&key);
-+	if (r == NULL)
-+		return 0;
-+
-+	r->getrq_us = bpf_ktime_get_ns() / 1000;
-+	return 0;
-+}
-+
-+TRACEPOINT_PROBE(block, block_bio_backmerge)
-+{
-+	struct req_stat *r;
-+	disk_key_t key = {};
-+	dev_t dev = args->dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	key.dev = args->dev;
-+	key.sector = args->sector;
-+	io_requests.delete(&key);
-+
-+	return 0;
-+}
-+
-+TRACEPOINT_PROBE(block, block_bio_frontmerge)
-+{
-+	struct req_stat *r;
-+	struct req_stat *r2;
-+	disk_key_t key = {};
-+	disk_key_t key2 = {};
-+	dev_t dev = args->dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	key.dev = args->dev;
-+	key.sector = args->sector + args->nr_sector;
-+	r = io_requests.lookup(&key);
-+	if (r == NULL)
-+		return 0;
-+
-+	key2.dev = args->dev;
-+	key2.sector = args->sector;
-+	r2 = io_requests.lookup(&key2);
-+	if (r2 == NULL) {
-+		io_requests.delete(&key);
-+		return 0;
-+	}
-+
-+	// keep old value.
-+	r2->queue_us = r->queue_us;
-+	r2->getrq_us = r->getrq_us;
-+	r2->insert_us = r->insert_us;
-+	r2->issue_us = r->issue_us;
-+	io_requests.delete(&key);
-+
-+	return 0;
-+}
-+
-+TRACEPOINT_PROBE(block, block_rq_insert)
-+{
-+	struct req_stat *r;
-+	disk_key_t key = {};
-+	dev_t dev = args->dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	key.dev = args->dev;
-+	key.sector = args->sector;
-+	r = io_requests.lookup(&key);
-+	if (r == NULL)
-+		return 0;
-+
-+	r->insert_us = bpf_ktime_get_ns() / 1000;
-+	return 0;
-+}
-+
-+TRACEPOINT_PROBE(block, block_rq_issue)
-+{
-+	struct req_stat *r;
-+	disk_key_t key = {};
-+	dev_t dev = args->dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	key.dev = args->dev;
-+	key.sector = args->sector;
-+	r = io_requests.lookup(&key);
-+	if (r == NULL)
-+		return 0;
-+
-+	r->issue_us = bpf_ktime_get_ns() / 1000;
-+	r->num_sectors = args->nr_sector;
-+	return 0;
-+}
-+
-+TRACEPOINT_PROBE(block, block_rq_complete)
-+{
-+	struct req_stat *r;
-+	struct req_stat val = {};
-+	disk_key_t key = {};
-+	dev_t dev = args->dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	key.dev = args->dev;
-+	key.sector = args->sector;
-+	r = io_requests.lookup(&key);
-+	if (r == NULL)
-+		return 0;
-+
-+	/* could be happen.
-+	if (r->num_sectors != args->nr_sector)
-+		bpf_trace_printk("here");
-+	*/
-+	r->completion_us = bpf_ktime_get_ns() / 1000;
-+	/* FIXME: should we use r? */
-+	//val = *r;
-+	val.dev = r->dev;
-+	val.queue_us = r->queue_us;
-+	val.getrq_us = r->getrq_us;
-+	val.insert_us = r->insert_us;
-+	val.issue_us = r->issue_us;
-+	val.completion_us = r->completion_us;
-+	val.sector = args->sector;
-+	val.num_sectors = args->nr_sector;
-+	memcpy(val.rwbs, r->rwbs, 8);
-+
-+	io_requests.delete(&key);
-+
-+	req_stat.perf_submit(args, &val, sizeof(struct req_stat));
-+	return 0;
-+}
-+
-+int trace_elv_merge_requests(struct pt_regs *ctx, struct request_queue *q,
-+		struct request *rq, struct request *next)
-+{
-+	disk_key_t key = {};
-+	dev_t dev = (next->rq_disk)->part0.__dev.devt;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	key.dev = dev;
-+	key.sector = next->__sector;
-+	io_requests.delete(&key);
-+
-+	return 0;
-+}
-+
-+"""
-+
-+dev_name={}
-+
-+def init_dev_name():
-+        global dev_name
-+
-+        f = open("/proc/partitions")
-+        for line in f.readlines():
-+                line = line.strip()
-+                if not len(line):
-+                        continue;
-+
-+                str = line.split()
-+                if str[0].isalpha():
-+                        continue
-+                dev_num = (int(str[0]) << 20) + int(str[1])
-+                dev_name[dev_num] = str[3]
-+        f.close()
-+
-+init_dev_name()
-+name2devid={v:k for k,v in dev_name.items()}
-+
-+if args.device:
-+	bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device])
-+else:
-+	bpf_text = bpf_text.replace('FILTER_DEV', '0')
-+
-+if args.dump_raw:
-+	dump_raw = 1
-+else:
-+	dump_raw = 0
-+
-+# code substitutions
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+# load BPF program
-+b = BPF(text=bpf_text)
-+b.attach_kprobe(event="elv_merge_requests", fn_name="trace_elv_merge_requests")
-+
-+print("Tracing block device I/O... Hit Ctrl-C to end.")
-+
-+class Data(ct.Structure):
-+	_fields_ = [
-+		("dev", ct.c_uint),
-+		("queue_us", ct.c_ulonglong),
-+		("getrq_us", ct.c_ulonglong),
-+		("insert_us", ct.c_ulonglong),
-+		("issue_us", ct.c_ulonglong),
-+		("completion_us", ct.c_ulonglong),
-+		("sector", ct.c_ulonglong),
-+		("num_sectors", ct.c_uint),
-+		("rwbs", ct.c_char * 8)
-+    ]
-+
-+io_stats={}
-+last_dis_time=0
-+
-+class req_stats:
-+	def __init__(self, dev):
-+		self.dev=dev
-+		self.sectors=[0, 0, 0]
-+		self.num_reqs=[0, 0, 0]
-+		self.lat=[0, 0, 0]
-+		self.lat_max=[0, 0, 0]
-+		self.lat_min=[99999999, 99999999, 99999999]
-+		self.i2d_lat=[0, 0, 0]
-+		self.i2d_lat_max=[0, 0, 0]
-+		self.i2d_lat_min=[99999999, 99999999, 99999999]
-+		self.d2c_lat=[0, 0, 0]
-+		self.d2c_lat_max=[0,0,0]
-+		self.d2c_lat_min=[99999999, 99999999, 99999999]
-+
-+	def update_req_stats(self, io_type, num_sectors, lat, i2d_lat, d2c_lat):
-+		self.num_reqs[io_type] += 1
-+		self.sectors[io_type] += num_sectors
-+		self.lat[io_type] += lat
-+		self.i2d_lat[io_type] += i2d_lat
-+		self.d2c_lat[io_type] += d2c_lat
-+
-+		if lat > self.lat_max[io_type]:
-+			self.lat_max[io_type] = lat;
-+
-+		if lat < self.lat_min[io_type]:
-+			self.lat_min[io_type] = lat;
-+
-+		if i2d_lat > self.i2d_lat_max[io_type]:
-+			self.i2d_lat_max[io_type] = i2d_lat
-+
-+		if i2d_lat < self.i2d_lat_min[io_type]:
-+			self.i2d_lat_min[io_type] = i2d_lat
-+
-+		if d2c_lat > self.d2c_lat_max[io_type]:
-+			self.d2c_lat_max[io_type] = d2c_lat
-+
-+		if d2c_lat < self.d2c_lat_min[io_type]:
-+			self.d2c_lat_min[io_type] = d2c_lat
-+
-+	def display_req_stats(self, time_interval):
-+		global avg_threshold_time
-+
-+		ti = time_interval / 1000.0 / 1000.0
-+
-+		print("dev: %s" % dev_name[self.dev])
-+		for io_type in (0, 1, 2):
-+			if self.num_reqs[io_type] == 0:
-+				continue
-+			total_reqs = self.num_reqs[io_type]
-+
-+			iops = int(total_reqs / ti)
-+			avg_lat = self.lat[io_type] / 1000.0 / total_reqs
-+			if avg_lat <  avg_threshold_time:
-+				continue;
-+
-+			speed = self.sectors[io_type] * 512 / 1024 / 1024 / ti
-+			avg_req_sz = self.sectors[io_type] * 512 / 1024 / total_reqs
-+
-+			i2d_avg = self.i2d_lat[io_type] / 1000.0 / total_reqs
-+			i2d_percent = int(i2d_avg / avg_lat * 100)
-+
-+			d2c_avg = self.d2c_lat[io_type] / 1000.0 / total_reqs
-+			d2c_percent = int(d2c_avg / avg_lat * 100)
-+
-+			if io_type == 0:
-+				io_type_str="READ"
-+			elif io_type == 1:
-+				io_type_str="WRITE"
-+			else:
-+				io_type_str="DISCARD"
-+
-+			print("  %s iops:%d avg:%.2f min:%.2f max:%.2f speed:%.2fMB/s avgrq-sz:%dKB" %
-+			      (io_type_str, iops, avg_lat, self.lat_min[io_type] / 1000.0,
-+			      self.lat_max[io_type] / 1000.0, speed, avg_req_sz))
-+
-+			print("  [i2d_avg: %.2f percent:%d min:%.2f max:%.2f" % (i2d_avg,
-+			      i2d_percent, self.i2d_lat_min[io_type] / 1000.0,
-+			      self.i2d_lat_max[io_type] / 1000.0))
-+
-+			print("  [d2c_avg: %.2f percent:%d min:%.2f max:%.2f" % (d2c_avg,
-+			      d2c_percent, self.d2c_lat_min[io_type] / 1000.0,
-+			      self.d2c_lat_max[io_type] / 1000.0))
-+
-+
-+def get_io_type(rwbs):
-+	# please see logic in kernel func blk_fill_rwbs()
-+	if rwbs[0] == 'R' or rwbs[0:2] == "FR":
-+		return 0;
-+	elif rwbs[0] == 'W' or rwbs[0:2] == "FW":
-+		return 1;
-+	return 2
-+
-+def is_exception_req(event):
-+	# we count this as a abnomal request...
-+	# Currently I still hasn't figured out in which case getrq_us
-+	# will be zero...
-+	if event.getrq_us == 0:
-+		print("dev:%s act:%s sector:%lu bytes:%u exception req Q:%lu G:%lu I:%lu D:%lu C:%lu" % (
-+		      dev_name[event.dev], event.rwbs, event.sector, event.num_sectors, event.queue_us,
-+		      event.getrq_us, event.insert_us, event.issue_us, event.completion_us))
-+
-+def dump_raw_trace_event(dev, act, ts, event):
-+	print("%s %s %lu sector:%lu bytes:%u act:%s" % (dev, act, ts / 1000,
-+	      event.sector, event.num_sectors, event.rwbs))
-+
-+def dump_timeout_req(event):
-+	dev = dev_name[event.dev]
-+	localtime = time.asctime( time.localtime(time.time()))
-+
-+	print("%s" % localtime)
-+	print("timeouted request sector:%lu bytes:%u req time:%lu" % (event.sector,
-+	      event.num_sectors, (event.completion_us - event.queue_us) / 1000))
-+	dump_raw_trace_event(dev, "Q", event.queue_us, event)
-+	dump_raw_trace_event(dev, "G", event.getrq_us, event)
-+	dump_raw_trace_event(dev, "I", event.insert_us, event)
-+	dump_raw_trace_event(dev, "D", event.issue_us, event)
-+	dump_raw_trace_event(dev, "C", event.completion_us, event)
-+
-+def dump_req_life_cycle(event):
-+	dev = dev_name[event.dev]
-+	localtime = time.asctime( time.localtime(time.time()))
-+
-+	print("%s" % localtime)
-+	print("raw request event sector:%lu bytes:%u req time:%lu" % (event.sector,
-+	      event.num_sectors, (event.completion_us - event.queue_us) / 1000))
-+	dump_raw_trace_event(dev, "Q", event.queue_us, event)
-+	dump_raw_trace_event(dev, "G", event.getrq_us, event)
-+	dump_raw_trace_event(dev, "I", event.insert_us, event)
-+	dump_raw_trace_event(dev, "D", event.issue_us, event)
-+	dump_raw_trace_event(dev, "C", event.completion_us, event)
-+
-+# process event
-+def print_event(cpu, data, size):
-+	global io_stats
-+	global last_dis_time
-+
-+	event = ct.cast(data, ct.POINTER(Data)).contents
-+
-+	is_exception_req(event)
-+
-+	if dump_raw != 0:
-+		dump_req_life_cycle(event)
-+
-+	lat = event.completion_us - event.queue_us;
-+
-+	if event.issue_us != 0:
-+		d2c_lat = event.completion_us - event.issue_us
-+	elif event.insert_us != 0:
-+		d2c_lat = event.completion_us - event.insert_us
-+	elif event.getrq_us != 0:
-+		d2c_lat = event.completion_us - event.getrq_us
-+	else:
-+		d2c_lat = lat
-+
-+	# Note: this could happen...
-+	# In kernel 4.9, for example, scsi_request_fn calls blk_peek_request()
-+	# to fetch a request, but if later scsi driver could not process this
-+	# request(see not_ready lable in scsi_request_fn), it will call
-+	# blk_requeue_request() to put back this request to block elevator, now
-+	# insert_us will be greater than issue_us.
-+	#
-+	# Later scsi_request_fn will pick this request again, but still it could
-+	# not handle this request, see codes in scsi_request_fn
-+	#	if (!scsi_dev_queue_ready(q, sdev))
-+	#		break;
-+	# then this request will still be in block elevator, but with RQF_STARTED
-+	# marked.
-+	#
-+	# Finally scsi_request_fn calls blk_peek_request() to process this request,
-+	# because RQF_STARTED is set, and this time blk_peek_request won't call
-+	# trace_block_rq_issue(), then issue_us also won't be updated.
-+	# Now insert_us is greater than issue_us.
-+	if event.issue_us <  event.insert_us:
-+		print("lege %u %s sector:%lu bytes:%u %lu %lu" % (dev_name[event.dev],
-+		      event.rwbs, event.sector, event.num_sectors, event.insert_us,
-+		      event.issue_us))
-+
-+	if event.insert_us != 0 and event.issue_us > event.insert_us:
-+		i2d_lat = event.issue_us - event.insert_us;
-+	else:
-+		i2d_lat = 0
-+	io_type = get_io_type(event.rwbs)
-+
-+	if last_dis_time == 0:
-+		last_dis_time = event.completion_us
-+
-+	if not io_stats.has_key(event.dev):
-+		io_stats[event.dev] = req_stats(event.dev)
-+
-+	if lat > threshold_time * 1000:
-+		dump_timeout_req(event)
-+
-+	io_stats[event.dev].update_req_stats(io_type, event.num_sectors, lat, i2d_lat, d2c_lat)
-+	if event.completion_us > last_dis_time and (event.completion_us - last_dis_time) > dis_interval * 1000 * 1000:
-+		time_interval = event.completion_us - last_dis_time;
-+		localtime = time.asctime( time.localtime(time.time()))
-+
-+		keys = io_stats.keys()
-+		keys.sort()
-+		print("%s" % localtime)
-+		for k in keys:
-+			io_stats[k].display_req_stats(time_interval)
-+		last_dis_time = event.completion_us
-+
-+		print("\n")
-+		io_stats.clear()
-+
-+
-+b["req_stat"].open_perf_buffer(print_event, page_cnt=64)
-+while 1:
-+    try:
-+        b.perf_buffer_poll()
-+
-+    except KeyboardInterrupt:
-+        exit()
-diff --git a/tools/alibiolatency_example.txt b/tools/alibiolatency_example.txt
-new file mode 100644
-index 0000000..70afc6d
---- /dev/null
-+++ b/tools/alibiolatency_example.txt
-@@ -0,0 +1,119 @@
-+Demonstrations of alibiolatency.
-+
-+Alibiolatency can trace every io request's life cycle and do data filter and statistics
-+accordingly. With this tool, we can clearly know io request's different stage's latency
-+proportion, for example, if io request D2C(see blktrace) time take majority in total io
-+request process time, it's most likely disk bottleneck has been reached.
-+
-+Usage:
-+alibiolatency [-h] [-d DEVICE] [-i [DIS_INTERVAL]] [-t [AVG_THRESHOLD_TIME]] [-T [THRESHOLD_TIME]] [-r]
-+
-+Summarize block device I/O latency
-+
-+optional arguments:
-+  -h, --help            show this help message and exit
-+  -d DEVICE, --device DEVICE
-+                        inspect specified device
-+  -i [DIS_INTERVAL], --dis_interval [DIS_INTERVAL]
-+                        specify display interval
-+  -t [AVG_THRESHOLD_TIME], --avg_threshold_time [AVG_THRESHOLD_TIME]
-+                        display only when average request process time is
-+                        greater than this value
-+  -T [THRESHOLD_TIME], --threshold_time [THRESHOLD_TIME]
-+                        dump request life cycle when single request process
-+                        time is greater than this value
-+  -r, --dump_raw        dump every io request life cycle
-+
-+examples:
-+    ./alibiolatency          # summarize block I/O latency(default display interval is 2s)
-+    ./alibiolatency -d sda3  # inspect specified device /dev/sda3
-+    ./alibiolatency -i 2     # specify display interval, 2s
-+    ./alibiolatency -t 10    # display only when average request process time is greater than 10ms
-+    ./alibiolatency -T 20    # dump request life cycle when single request process time is greater than 20ms
-+    ./alibiolatency -T 20    # dump request life cycle when single request process time is greater than 20ms
-+    ./alibiolatency -r       # dump every io request life cycle
-+
-+
-+Output example:
-+
-+Example 1:
-+# Summarize block device I/O latency periodically.
-+python alibiolatency.py
-+
-+Wed Aug 28 19:13:00 2019
-+dev: sda
-+  READ iops:32 avg:15.36 min:0.02 max:143.51 speed:0.00MB/s avgrq-sz:4KB
-+  [i2d_avg: 0.00 percent:0 min:0.00 max:0.01
-+  [d2c_avg: 15.26 percent:99 min:0.02 max:143.51
-+  WRITE iops:2 avg:0.06 min:0.04 max:0.09 speed:0.00MB/s avgrq-sz:24KB
-+  [i2d_avg: 0.00 percent:2 min:0.00 max:0.00
-+  [d2c_avg: 0.05 percent:76 min:0.03 max:0.06
-+
-+
-+Wed Aug 28 19:13:03 2019
-+dev: sda
-+  READ iops:62 avg:2.03 min:0.03 max:16.08 speed:0.68MB/s avgrq-sz:14KB
-+  [i2d_avg: 0.00 percent:0 min:0.00 max:0.00
-+  [d2c_avg: 1.99 percent:97 min:0.03 max:16.07
-+  WRITE iops:6 avg:0.06 min:0.03 max:0.12 speed:0.00MB/s avgrq-sz:10KB
-+  [i2d_avg: 0.00 percent:5 min:0.00 max:0.01
-+  [d2c_avg: 0.04 percent:76 min:0.03 max:0.07
-+
-+Example 2:
-+# Summarize block device I/O latency periodically, and will print io request's
-+# complete life cycle when this io request's process time is greater than 20ms.
-+python alibiolatency.py  -T 20 # Summarize block device I/O latency periodically.
-+
-+timeouted request sector:269555712 bytes:1024 req time:379
-+sda Q 3102021522 sector:269555712 bytes:1024 act:W
-+sda G 3102021522 sector:269555712 bytes:1024 act:W
-+sda I 3102021522 sector:269555712 bytes:1024 act:W
-+sda D 3102021522 sector:269555712 bytes:1024 act:W
-+sda C 3102021902 sector:269555712 bytes:1024 act:W
-+Wed Aug 28 19:28:44 2019
-+timeouted request sector:269556736 bytes:1024 req time:381
-+sda Q 3102021522 sector:269556736 bytes:1024 act:W
-+sda G 3102021522 sector:269556736 bytes:1024 act:W
-+sda I 3102021522 sector:269556736 bytes:1024 act:W
-+sda D 3102021522 sector:269556736 bytes:1024 act:W
-+sda C 3102021904 sector:269556736 bytes:1024 act:W
-+Wed Aug 28 19:28:45 2019
-+dev: sda
-+  READ iops:0 avg:0.07 min:0.07 max:0.07 speed:0.00MB/s avgrq-sz:4KB
-+  [i2d_avg: 0.01 percent:17 min:0.01 max:0.01
-+  [d2c_avg: 0.05 percent:73 min:0.05 max:0.05
-+  WRITE iops:173 avg:339.79 min:0.04 max:501.72 speed:69.09MB/s avgrq-sz:408KB
-+  [i2d_avg: 0.02 percent:0 min:0.00 max:0.06
-+  [d2c_avg: 339.16 percent:99 min:0.04 max:501.67
-+
-+Example 3:
-+# Summarize block device I/O latency periodically, and will print every io
-+# request's complete life cycle.
-+python alibiolatency.py  -r
-+
-+Wed Aug 28 19:36:40 2019
-+raw request event sector:52903080 bytes:8 req time:0
-+sda Q 3102498371 sector:52903080 bytes:8 act:WS
-+sda G 3102498371 sector:52903080 bytes:8 act:WS
-+sda I 3102498371 sector:52903080 bytes:8 act:WS
-+sda D 3102498371 sector:52903080 bytes:8 act:WS
-+sda C 3102498371 sector:52903080 bytes:8 act:WS
-+Wed Aug 28 19:36:40 2019
-+raw request event sector:2157826760 bytes:8 req time:0
-+sda Q 3102498785 sector:2157826760 bytes:8 act:W
-+sda G 3102498785 sector:2157826760 bytes:8 act:W
-+sda I 3102498785 sector:2157826760 bytes:8 act:W
-+sda D 3102498785 sector:2157826760 bytes:8 act:W
-+sda C 3102498785 sector:2157826760 bytes:8 act:W
-+Wed Aug 28 19:36:42 2019
-+raw request event sector:208390056 bytes:8 req time:0
-+sda Q 3102500593 sector:208390056 bytes:8 act:W
-+sda G 3102500593 sector:208390056 bytes:8 act:W
-+sda I 3102500593 sector:208390056 bytes:8 act:W
-+sda D 3102500593 sector:208390056 bytes:8 act:W
-+sda C 3102500593 sector:208390056 bytes:8 act:W
-+Wed Aug 28 19:36:42 2019
-+dev: sda
-+  WRITE iops:42 avg:0.13 min:0.04 max:0.24 speed:0.00MB/s avgrq-sz:5KB
-+  [i2d_avg: 0.01 percent:8 min:0.00 max:0.03
-+  [d2c_avg: 0.06 percent:48 min:0.03 max:0.10
-diff --git a/tools/aliexitsnoop.py b/tools/aliexitsnoop.py
-new file mode 100755
-index 0000000..4f0b1a1
---- /dev/null
-+++ b/tools/aliexitsnoop.py
-@@ -0,0 +1,302 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+from __future__ import print_function
-+
-+import argparse
-+import ctypes as ct
-+import os
-+import platform
-+import re
-+import signal
-+import sys
-+
-+from bcc import BPF
-+from bcc.utils import printb
-+from datetime import datetime
-+from time import strftime
-+
-+#
-+# aliexitsnoop Trace all process termination (exit, fatal signal)
-+#           For Linux, uses BCC, eBPF. Embedded C.
-+#
-+# USAGE: aliexitsnoop [-h] [-x] [-t] [--utc] [--label[=LABEL]] [-p PID]
-+#
-+_examples = """examples:
-+    aliexitsnoop                # trace all process termination
-+    aliexitsnoop -x             # trace only fails, exclude exit(0)
-+    aliexitsnoop -t             # include timestamps (local time)
-+    aliexitsnoop --utc          # include timestamps (UTC)
-+    aliexitsnoop -p 181         # only trace PID 181
-+    aliexitsnoop --label=exit   # label each output line with 'exit'
-+"""
-+"""
-+  Exit status (from <include/sysexits.h>):
-+
-+    0 EX_OK        Success
-+    2              argparse error
-+   70 EX_SOFTWARE  syntax error detected by compiler, or
-+                   verifier error from kernel
-+   77 EX_NOPERM    Need sudo (CAP_SYS_ADMIN) for BPF() system call
-+
-+  More information about this script is in bcc/tools/aliexitsnoop_example.txt
-+
-+  Copyright 2016 Netflix, Inc.
-+  Copyright 2019 Instana, Inc.
-+  Copyright 2019 Alibaba, Inc.
-+  Licensed under the Apache License, Version 2.0 (the "License")
-+
-+  07-Feb-2016   Brendan Gregg (Netflix)            Created execsnoop
-+  04-May-2019   Arturo Martin-de-Nicolas (Instana) Created exitsnoop
-+  13-May-2019   Jeroen Soeters (Instana) Refactor to import as module
-+  16-Jul-2019   Alvin Zheng (Alibaba) Created aliexitsnoop
-+"""
-+
-+def _getParser():
-+    parser = argparse.ArgumentParser(
-+        description="Trace all process termination (exit, fatal signal)",
-+        formatter_class=argparse.RawDescriptionHelpFormatter,
-+        epilog=_examples)
-+    a = parser.add_argument
-+    a("-t", "--timestamp", action="store_true", help="include timestamp (local time default)")
-+    a("--utc", action="store_true", help="include timestamp in UTC (-t implied)")
-+    a("-p", "--pid", help="trace this PID only")
-+    a("--label", help="label each line")
-+    a("-x", "--failed", action="store_true", help="trace only fails, exclude exit(0)")
-+    # print the embedded C program and exit, for debugging
-+    a("--ebpf", action="store_true", help=argparse.SUPPRESS)
-+    # RHEL 7.6 keeps task->start_time as struct timespec, convert to u64 nanoseconds
-+    a("--timespec", action="store_true", help=argparse.SUPPRESS)
-+    return parser.parse_args
-+
-+
-+class Global():
-+    parse_args = _getParser()
-+    args = None
-+    argv = None
-+    bpf = None
-+    SIGNUM_TO_SIGNAME = dict((v, re.sub("^SIG", "", k))
-+        for k, v in signal.__dict__.items() if re.match("^SIG[A-Z]+$", k))
-+
-+
-+class Data(ct.Structure):
-+    """Event data matching struct data_t in _embedded_c()."""
-+    _TASK_COMM_LEN = 16      # linux/sched.h
-+    _pack_ = 1
-+    _fields_ = [
-+        ("start_time", ct.c_ulonglong),  # task->start_time, see --timespec arg
-+        ("exit_time", ct.c_ulonglong),  # bpf_ktime_get_ns()
-+        ("pid_tgid", ct.c_ulonglong),
-+        ("pid", ct.c_uint),  # task->tgid, thread group id == sys_getpid()
-+        ("tid", ct.c_uint),  # task->pid, thread id == sys_gettid()
-+        ("ppid", ct.c_uint),  # task->parent->tgid, notified of exit
-+        ("exit_code", ct.c_int),
-+        ("sig_info", ct.c_uint),
-+        ("k_stack_id", ct.c_int),
-+        ("u_stack_id", ct.c_int),
-+        ("task", ct.c_char * _TASK_COMM_LEN)
-+    ]
-+
-+def _embedded_c(args):
-+    """Generate C program for sched_process_exit tracepoint in kernel/exit.c."""
-+    c = """
-+    EBPF_COMMENT
-+    #include <linux/sched.h>
-+    BPF_STATIC_ASSERT_DEF
-+    BPF_STACK_TRACE(stack_traces,16384);
-+    struct data_t {
-+        u64 start_time;
-+        u64 exit_time;
-+        u64 pid_tgid;
-+        u32 pid;
-+        u32 tid;
-+        u32 ppid;
-+        int exit_code;
-+        u32 sig_info;
-+        int k_stack_id;
-+        int u_stack_id;
-+        char task[TASK_COMM_LEN];
-+    } __attribute__((packed));
-+
-+    BPF_STATIC_ASSERT(sizeof(struct data_t) == CTYPES_SIZEOF_DATA);
-+    BPF_PERF_OUTPUT(events);
-+
-+    int kprobe_do_exit(struct pt_regs * ctx, long exit_code)
-+    {
-+        struct task_struct *task = (typeof(task))bpf_get_current_task();
-+        if (FILTER_PID || FILTER_EXIT_CODE) { return 0; }
-+
-+        struct data_t data = {
-+            .start_time = PROCESS_START_TIME_NS,
-+            .exit_time = bpf_ktime_get_ns(),
-+            .pid_tgid=bpf_get_current_pid_tgid(),
-+            .pid = task->tgid,
-+            .tid = task->pid,
-+            .ppid = task->parent->tgid,
-+            .exit_code = task->exit_code >> 8,
-+            .sig_info = task->exit_code & 0xFF,
-+        };
-+        bpf_get_current_comm(&data.task, sizeof(data.task));
-+        data.k_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID);
-+        data.u_stack_id=stack_traces.get_stackid(ctx,BPF_F_USER_STACK|BPF_F_REUSE_STACKID);
-+        events.perf_submit(ctx, &data, sizeof(data));
-+        return 0;
-+    }
-+    """
-+    # TODO: this macro belongs in bcc/src/cc/export/helpers.h
-+    bpf_static_assert_def = r"""
-+    #ifndef BPF_STATIC_ASSERT
-+    #define BPF_STATIC_ASSERT(condition) __attribute__((unused)) \
-+    extern int bpf_static_assert[(condition) ? 1 : -1]
-+    #endif
-+    """
-+    code_substitutions = [
-+        ('EBPF_COMMENT', '' if not Global.args.ebpf else _ebpf_comment()),
-+        ("BPF_STATIC_ASSERT_DEF", bpf_static_assert_def),
-+        ("CTYPES_SIZEOF_DATA", str(ct.sizeof(Data))),
-+        ('FILTER_PID', '0' if not Global.args.pid else "task->tgid != %s" % Global.args.pid),
-+        ('FILTER_EXIT_CODE', '0' if not Global.args.failed else 'task->exit_code == 0'),
-+        ('PROCESS_START_TIME_NS', 'task->start_time' if not Global.args.timespec else
-+             '(task->start_time.tv_sec * 1000000000L) + task->start_time.tv_nsec'),
-+    ]
-+    for old, new in code_substitutions:
-+        c = c.replace(old, new)
-+    return c
-+
-+def _ebpf_comment():
-+    """Return a C-style comment with information about the generated code."""
-+    comment = ('Created by %s at %s:\n\t%s' %
-+                    (sys.argv[0], strftime("%Y-%m-%d %H:%M:%S %Z"), _embedded_c.__doc__))
-+    args = str(vars(Global.args)).replace('{', '{\n\t').replace(', ', ',\n\t').replace('}', ',\n }\n\n')
-+    return ("\n   /*" + ("\n %s\n\n ARGV = %s\n\n ARGS = %s/" %
-+                             (comment, ' '.join(Global.argv), args))
-+                   .replace('\n', '\n\t*').replace('\t', '    '))
-+
-+def _print_header():
-+    if Global.args.timestamp:
-+        title = 'TIME-' + ('UTC' if Global.args.utc else strftime("%Z"))
-+        print("%-13s" % title, end="")
-+    if Global.args.label is not None:
-+        print("%-6s" % "LABEL", end="")
-+    print("%-16s %-6s %-6s %-6s %-7s %-10s" %
-+              ("PCOMM", "PID", "PPID", "TID", "AGE(s)", "EXIT_CODE"))
-+
-+def _print_event(cpu, data, size):  # callback
-+    """Print the exit event."""
-+    stack_traces = Global.bpf['stack_traces']
-+    e = ct.cast(data, ct.POINTER(Data)).contents
-+    if Global.args.timestamp:
-+        now = datetime.utcnow() if Global.args.utc else datetime.now()
-+        print("%-13s" % (now.strftime("%H:%M:%S.%f")[:-3]), end="")
-+    if Global.args.label is not None:
-+        label = Global.args.label if len(Global.args.label) else 'exit'
-+        print("%-6s" % label, end="")
-+    age = (e.exit_time - e.start_time) / 1e9
-+    print("%-16s %-6d %-6d %-6d %-7.2f " %
-+              (e.task.decode(), e.pid, e.ppid, e.tid, age), end="")
-+    if e.sig_info == 0:
-+        print("0" if e.exit_code == 0 else "code %d" % e.exit_code)
-+    else:
-+        sig = e.sig_info & 0x7F
-+        if sig:
-+            print("signal %d (%s)" % (sig, signum_to_signame(sig)), end="")
-+        if e.sig_info & 0x80:
-+            print(", core dumped ", end="")
-+        print()
-+    # print the kernel stack
-+    if e.k_stack_id >= 0:
-+        cur_stack = reversed(list(stack_traces.walk(e.k_stack_id)))
-+        for addr in cur_stack:
-+            printb(b"Kernel Stack:    %-16x  %s" % (addr, Global.bpf.ksym(addr)))
-+    else:
-+        print("No KERNEL STACK FOUND")
-+    # print the user stack
-+    if e.u_stack_id >= 0:
-+        cur_stack = reversed(list(stack_traces.walk(e.u_stack_id)))
-+        for addr in cur_stack:
-+            printb(b"USER Stack:      %-16x  %s" % (addr, Global.bpf.sym(addr, e.pid_tgid)))
-+    else:
-+        print("No USER STACK FOUND")
-+    print("=================================")
-+    print()
-+# =============================
-+# Module: These functions are available for import
-+# =============================
-+def initialize(arg_list=sys.argv[1:]):
-+    """Trace all process termination.
-+
-+    arg_list - list of args, if omitted then uses command line args
-+               arg_list is passed to argparse.ArgumentParser.parse_args()
-+
-+    For example, if arg_list = [ '-x', '-t' ]
-+       args.failed == True
-+       args.timestamp == True
-+
-+    Returns a tuple (return_code, result)
-+       0 = Ok, result is the return value from BPF()
-+       1 = args.ebpf is requested, result is the generated C code
-+       os.EX_NOPERM: need CAP_SYS_ADMIN, result is error message
-+       os.EX_SOFTWARE: internal software error, result is error message
-+    """
-+    Global.argv = arg_list
-+    Global.args = Global.parse_args(arg_list)
-+    if Global.args.utc and not Global.args.timestamp:
-+        Global.args.timestamp = True
-+    if not Global.args.ebpf and os.geteuid() != 0:
-+        return (os.EX_NOPERM, "Need sudo (CAP_SYS_ADMIN) for BPF() system call")
-+    if re.match('^3\.10\..*el7.*$', platform.release()):  # Centos/Red Hat
-+        Global.args.timespec = True
-+    for _ in range(2):
-+        c = _embedded_c(Global.args)
-+        if Global.args.ebpf:
-+            return (1, c)
-+        try:
-+            return (os.EX_OK, BPF(text=c))
-+        except Exception as e:
-+            error = format(e)
-+            if (not Global.args.timespec and error.find('struct timespec') and error.find('start_time')):
-+                print('This kernel keeps task->start_time in a struct timespec.\n' +
-+                          'Retrying with --timespec')
-+                Global.args.timespec = True
-+                continue
-+            return (os.EX_SOFTWARE, "BPF error: " + error)
-+        except:
-+            return (os.EX_SOFTWARE, "Unexpected error: {0}".format(sys.exc_info()[0]))
-+
-+def snoop(bpf, event_handler):
-+    """Call event_handler for process termination events.
-+
-+    bpf - result returned by successful initialize()
-+    event_handler - callback function to handle termination event
-+    args.pid - Return after event_handler is called, only monitoring this pid
-+    """
-+    bpf["events"].open_perf_buffer(event_handler)
-+    while True:
-+        bpf.perf_buffer_poll()
-+        if Global.args.pid:
-+            return
-+
-+def signum_to_signame(signum):
-+    """Return the name of the signal corresponding to signum."""
-+    return Global.SIGNUM_TO_SIGNAME.get(signum, "unknown")
-+
-+# =============================
-+# Script: invoked as a script
-+# =============================
-+def main():
-+    try:
-+        rc, buffer = initialize()
-+        Global.bpf = buffer
-+        Global.bpf.attach_kprobe(event="do_exit", fn_name="kprobe_do_exit")
-+        if rc:
-+            print(buffer)
-+            sys.exit(0 if Global.args.ebpf else rc)
-+        _print_header()
-+        snoop(buffer, _print_event)
-+    except KeyboardInterrupt:
-+        print()
-+        sys.exit()
-+
-+    return 0
-+
-+if __name__ == '__main__':
-+    main()
-diff --git a/tools/aliexitsnoop_example.txt b/tools/aliexitsnoop_example.txt
-new file mode 100644
-index 0000000..4b60929
---- /dev/null
-+++ b/tools/aliexitsnoop_example.txt
-@@ -0,0 +1,78 @@
-+Demonstrations of aliexitsnoop.
-+
-+This Linux tool is actually an enhanced version of the exitsnoop. For more details about
-+exitsnoop, see exitsnoop_example.txt. Compared to the exitsnoop, aliexitsnoop not only print
-+the exit code, pid, ppid, lifetime of the exiting processes, but also the stack of the exiting
-+processes. The demand stems from the fact that in the production environment, some processes
-+end up unexpectedly. Therefore we need to dump the stack of the target processes to find out
-+what caused the target process to exit.
-+
-+Usage:
-+aliexitsnoop.py [-h] [-t] [--utc] [-p PID] [--label LABEL] [-x]
-+
-+Trace all process termination (exit, fatal signal)
-+
-+optional arguments:
-+  -h, --help         show this help message and exit
-+  -t, --timestamp    include timestamp (local time default)
-+  --utc              include timestamp in UTC (-t implied)
-+  -p PID, --pid PID  trace this PID only
-+  --label LABEL      label each line
-+  -x, --failed       trace only fails, exclude exit(0)
-+
-+examples:
-+    aliexitsnoop                # trace all process termination
-+    aliexitsnoop -x             # trace only fails, exclude exit(0)
-+    aliexitsnoop -t             # include timestamps (local time)
-+    aliexitsnoop --utc          # include timestamps (UTC)
-+    aliexitsnoop -p 181         # only trace PID 181
-+    aliexitsnoop --label=exit   # label each output line with 'exit'
-+
-+The following example shows the output example when run in a loop that is interrupted with
-+Ctrl-C from the terminal:
-+
-+grep             25099  25087  25099  0.00    0
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa5083b54  __x64_sys_exit_group
-+Kernel Stack:    ffffffffa5083ad9  do_group_exit
-+Kernel Stack:    ffffffffa5082e11  do_exit
-+USER Stack:      c74864108b48      [unknown]
-+USER Stack:      7fd29b10c660      [unknown]
-+USER Stack:      7fd29b0611d9      [unknown]
-+=================================
-+
-+id               25101  25100  25101  0.00    0
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa5083b54  __x64_sys_exit_group
-+Kernel Stack:    ffffffffa5083ad9  do_group_exit
-+Kernel Stack:    ffffffffa5082e11  do_exit
-+USER Stack:      c74864108b48      [unknown]
-+USER Stack:      7f932c07b660      [unknown]
-+USER Stack:      7f932bfd01d9      [unknown]
-+=================================
-+
-+bash             25100  25087  25100  0.00    0
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa5083b54  __x64_sys_exit_group
-+Kernel Stack:    ffffffffa5083ad9  do_group_exit
-+Kernel Stack:    ffffffffa5082e11  do_exit
-+USER Stack:      c74864108b48      [unknown]
-+USER Stack:      7f5318862660      [unknown]
-+USER Stack:      7f53187b71d9      [unknown]
-+=================================
-+
-+systemd          1      0      25102  0.00    0
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa5083a97  __x64_sys_exit
-+Kernel Stack:    ffffffffa5082e11  do_exit
-+USER Stack:      7f92232c1e64      [unknown]
-+
-+We can see that aliexitsnoop prints the user stack and kernel stack of the exiting processes.
-+What needs to be noted is that aliexitsnoop can only get the address track of the user stack.
-+The reason why aliexitsnoop cannot get the symbols corresponding the addresses is that the
-+user space of the process does not exists after the process ends. However, the address track
-+of the user stack still helps, because you can analyze together with /proc/$pid/maps.
-diff --git a/tools/aliext4writeslower.py b/tools/aliext4writeslower.py
-new file mode 100755
-index 0000000..db626c7
---- /dev/null
-+++ b/tools/aliext4writeslower.py
-@@ -0,0 +1,411 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# aliext4writeslower  Trace slow ext4 write operation.
-+#             For Linux, uses BCC, eBPF.
-+#
-+# USAGE: aliext4writeslower [-h] [-p PID] [-d device] [min_ms]
-+#
-+# This script traces key steps of write(2) which maybe time-consuming,
-+# here trace 4 key steps: jbd2 handle start(it may need jbd2 trancation
-+# checkpoint or commit), page cache allocation, block allocation and
-+# balance dirty pages, and we also take into account sched delay.
-+#
-+# With this tool, we can clearly know write(2)'s latency proportion.
-+#
-+# WARNING: This adds low-overhead instrumentation to these ext4 operations,
-+# including reads and writes from the file system cache. Such reads and writes
-+# can be very frequent (depending on the workload; eg, 1M/sec), at which
-+# point the overhead of this tool (even if it prints no "slower" events) can
-+# begin to become significant.
-+#
-+# By default, a minimum millisecond threshold of 10 is used.
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+#
-+# 2019-08-29   Xiaoguang Wang   Created this.
-+
-+from __future__ import print_function
-+from bcc import BPF
-+import argparse
-+from time import strftime
-+import ctypes as ct
-+
-+# symbols
-+kallsyms = "/proc/kallsyms"
-+
-+# arguments
-+examples = """examples:
-+    ./aliext4writeslower             # trace write(2) slower than 10 ms (default)
-+    ./aliext4writeslower 1           # trace write(2) slower than 1 ms
-+    ./aliext4writeslower 0           # trace all operations (warning: verbose)
-+    ./aliext4writeslower -d sda5     # inspect specified device /dev/sda5 containing ext4 fs
-+    ./aliext4writeslower -p 185      # trace PID 185 only
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Trace common ext4 write(2) slower than a threshold",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-p", "--pid", help="trace this PID only")
-+parser.add_argument("-d", "--device", help="inspect specified device")
-+parser.add_argument("min_ms", nargs="?", default='10',
-+    help="minimum I/O duration to trace, in ms (default 10)")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+args = parser.parse_args()
-+min_ms = int(args.min_ms)
-+pid = args.pid
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+#include <linux/sched.h>
-+#include <linux/blkdev.h>
-+
-+enum {
-+	TRACE_PAGECACHE_ALLOC = 0,
-+	TRACE_JOURNAL_START,
-+	TRACE_BLOCK_ALLOC,
-+	TRACE_BALANCE_DIRTY,
-+	TRACE_WRITE_BEGIN,
-+	TRACE_WRITE_END,
-+	TRACE_TYPE_MAX,
-+};
-+
-+struct val_t {
-+	dev_t dev;
-+	u64 ts;
-+	u64 sched_delay;
-+	u64 trace_ts[TRACE_TYPE_MAX];
-+	u64 trace_time[TRACE_TYPE_MAX];
-+	u64 offset;
-+	struct file *fp;
-+};
-+
-+struct data_t {
-+	dev_t dev;
-+	u64 size;
-+	u64 offset;
-+	u64 delta_us;
-+	u64 pagecache_alloc_us;
-+	u64 journal_start_us;
-+	u64 block_alloc_us;
-+	u64 balance_dirty_us;
-+	u64 write_begin_us;
-+	u64 write_end_us;
-+	u64 sched_delay;
-+	u64 pid;
-+	char task[TASK_COMM_LEN];
-+	char file[DNAME_INLINE_LEN];
-+};
-+
-+BPF_HASH(entryinfo, u64, struct val_t);
-+BPF_PERF_OUTPUT(events);
-+
-+
-+// ext4_file_write_iter():
-+int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb)
-+{
-+	struct val_t val = {};
-+	u64 id = bpf_get_current_pid_tgid();
-+	u32 pid = id >> 32; // PID is higher part
-+	struct task_struct *p = (struct task_struct *)bpf_get_current_task();
-+	struct inode *inode = iocb->ki_filp->f_inode;
-+	dev_t dev = inode->i_sb->s_bdev->bd_dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	if (FILTER_PID)
-+		return 0;
-+
-+	val.ts = bpf_ktime_get_ns();
-+	val.sched_delay = p->sched_info.run_delay;
-+	val.fp = iocb->ki_filp;
-+	val.dev = dev;
-+	val.offset = iocb->ki_pos;
-+	if (val.fp)
-+		entryinfo.update(&id, &val);
-+
-+	return 0;
-+}
-+
-+int trace_write_return(struct pt_regs *ctx)
-+{
-+	struct val_t *valp;
-+	u64 ts, delta_us, id = bpf_get_current_pid_tgid();
-+	u32 size, pid = id >> 32;
-+	struct task_struct *p = (struct task_struct *)bpf_get_current_task();
-+	struct data_t data = {};
-+
-+	if (FILTER_PID)
-+		return 0;
-+
-+	valp = entryinfo.lookup(&id);
-+	if (valp == 0)
-+		return 0;
-+
-+	ts = bpf_ktime_get_ns();
-+	delta_us = (ts - valp->ts) / 1000;
-+	entryinfo.delete(&id);
-+	if (FILTER_US)
-+		return 0;
-+
-+	size = PT_REGS_RC(ctx);
-+	data.size = size;
-+	data.delta_us = delta_us;
-+	data.pid = pid;
-+	data.sched_delay = (p->sched_info.run_delay - valp->sched_delay) / 1000;
-+	data.offset = valp->offset;
-+	data.dev = valp->dev;
-+	data.pagecache_alloc_us = valp->trace_time[TRACE_PAGECACHE_ALLOC] / 1000;
-+	data.journal_start_us = valp->trace_time[TRACE_JOURNAL_START] / 1000;
-+	data.block_alloc_us = valp->trace_time[TRACE_BLOCK_ALLOC] / 1000;
-+	data.balance_dirty_us = valp->trace_time[TRACE_BALANCE_DIRTY] / 1000;
-+	data.write_begin_us = valp->trace_time[TRACE_WRITE_BEGIN] / 1000;
-+	data.write_end_us = valp->trace_time[TRACE_WRITE_END] / 1000;
-+	bpf_get_current_comm(&data.task, sizeof(data.task));
-+
-+	// workaround (rewriter should handle file to d_name in one step):
-+	struct dentry *de = NULL;
-+	struct qstr qs = {};
-+	bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry);
-+	bpf_probe_read(&qs, sizeof(qs), (void *)&de->d_name);
-+	if (qs.len == 0)
-+		return 0;
-+	bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name);
-+
-+	// output
-+	events.perf_submit(ctx, &data, sizeof(data));
-+
-+	return 0;
-+}
-+
-+
-+static int trace_type_entry(struct pt_regs *ctx, int type)
-+{
-+    struct val_t *valp;
-+    u64 id = bpf_get_current_pid_tgid();
-+    u32 pid = id >> 32; // PID is higher part
-+    u64 ts = bpf_ktime_get_ns();
-+
-+    valp = entryinfo.lookup(&id);
-+    if (valp == 0) {
-+        // missed tracing issue or filtered
-+        return 0;
-+    }
-+
-+    valp->trace_ts[type] = ts;
-+    return 0;
-+}
-+
-+static int trace_type_return(struct pt_regs *ctx, int type)
-+{
-+    struct val_t *valp;
-+    u64 id = bpf_get_current_pid_tgid();
-+    u32 pid = id >> 32; // PID is higher part
-+    u64 ts = bpf_ktime_get_ns();
-+
-+    valp = entryinfo.lookup(&id);
-+    if (valp == 0) {
-+        // missed tracing issue or filtered
-+        return 0;
-+    }
-+
-+    valp->trace_time[type] += ts - valp->trace_ts[type];;
-+    return 0;
-+}
-+
-+int trace_pagecache_alloc_entry(struct pt_regs *ctx)
-+{
-+	return trace_type_entry(ctx, TRACE_PAGECACHE_ALLOC);
-+}
-+
-+int trace_pagecache_alloc_return(struct pt_regs *ctx)
-+{
-+	return trace_type_return(ctx, TRACE_PAGECACHE_ALLOC);
-+}
-+
-+int trace_journal_start_entry(struct pt_regs *ctx)
-+{
-+	return trace_type_entry(ctx, TRACE_JOURNAL_START);
-+}
-+
-+int trace_journal_start_return(struct pt_regs *ctx)
-+{
-+	return trace_type_return(ctx, TRACE_JOURNAL_START);
-+}
-+
-+int trace_block_alloc_entry(struct pt_regs *ctx)
-+{
-+	return trace_type_entry(ctx, TRACE_BLOCK_ALLOC);
-+}
-+
-+int trace_block_alloc_return(struct pt_regs *ctx)
-+{
-+	return trace_type_return(ctx, TRACE_BLOCK_ALLOC);
-+}
-+
-+int trace_balance_dirty_entry(struct pt_regs *ctx)
-+{
-+	return trace_type_entry(ctx, TRACE_BALANCE_DIRTY);
-+}
-+
-+int trace_balance_dirty_return(struct pt_regs *ctx)
-+{
-+	return trace_type_return(ctx, TRACE_BALANCE_DIRTY);
-+}
-+
-+int trace_aops_write_begin_entry(struct pt_regs *ctx)
-+{
-+	return trace_type_entry(ctx, TRACE_WRITE_BEGIN);
-+}
-+
-+int trace_aops_write_begin_return(struct pt_regs *ctx)
-+{
-+	return trace_type_return(ctx, TRACE_WRITE_BEGIN);
-+}
-+
-+int trace_aops_write_end_entry(struct pt_regs *ctx)
-+{
-+	return trace_type_entry(ctx, TRACE_WRITE_END);
-+}
-+
-+int trace_aops_write_end_return(struct pt_regs *ctx)
-+{
-+	return trace_type_return(ctx, TRACE_WRITE_END);
-+}
-+"""
-+
-+devid2name={}
-+num_trans=0
-+
-+def init_dev_name():
-+        global devid2name
-+
-+        f = open("/proc/partitions")
-+        for line in f.readlines():
-+                line = line.strip()
-+                if not len(line):
-+                        continue;
-+
-+                str = line.split()
-+                if str[0].isalpha():
-+                        continue
-+                dev_num = (int(str[0]) << 20) + int(str[1])
-+                devid2name[dev_num] = str[3]
-+        f.close()
-+
-+init_dev_name()
-+
-+name2devid={v:k for k,v in devid2name.items()}
-+
-+if args.device:
-+    bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device])
-+else:
-+    bpf_text = bpf_text.replace('FILTER_DEV', '0')
-+
-+if min_ms == 0:
-+    bpf_text = bpf_text.replace('FILTER_US', '0')
-+else:
-+    bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_ms * 1000))
-+
-+if args.pid:
-+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
-+else:
-+    bpf_text = bpf_text.replace('FILTER_PID', '0')
-+
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+# kernel->user event data: struct data_t
-+DNAME_INLINE_LEN = 32   # linux/dcache.h
-+TASK_COMM_LEN = 16      # linux/sched.h
-+class Data(ct.Structure):
-+    _fields_ = [
-+        ("dev", ct.c_uint),
-+        ("size", ct.c_ulonglong),
-+        ("offset", ct.c_ulonglong),
-+        ("delta_us", ct.c_ulonglong),
-+        ("pagecache_alloc_us", ct.c_ulonglong),
-+        ("journal_start_us", ct.c_ulonglong),
-+        ("block_alloc_us", ct.c_ulonglong),
-+        ("balance_dirty_us", ct.c_ulonglong),
-+        ("write_begin_us", ct.c_ulonglong),
-+        ("write_end_us", ct.c_ulonglong),
-+        ("sched_delay", ct.c_ulonglong),
-+        ("pid", ct.c_ulonglong),
-+        ("task", ct.c_char * TASK_COMM_LEN),
-+        ("file", ct.c_char * DNAME_INLINE_LEN)
-+    ]
-+
-+# process event
-+def print_event(cpu, data, size):
-+    event = ct.cast(data, ct.POINTER(Data)).contents
-+
-+    print("%-8s %-8s %-14.14s %-6s %-7s %-8d %7.2f %8.2f %10.2f %11.2f %13.2f %11.2f %9.2f %11.2f %s" %
-+	(strftime("%H:%M:%S"), devid2name[event.dev],
-+        event.task.decode(), event.pid, event.size, event.offset / 1024,
-+        float(event.delta_us) / 1000, float(event.pagecache_alloc_us) / 1000,
-+	float(event.journal_start_us) / 1000, float(event.block_alloc_us) / 1000,
-+	float(event.balance_dirty_us) / 1000, float(event.write_begin_us) / 1000,
-+	float(event.write_end_us) / 1000, float(event.sched_delay) / 1000,
-+	event.file.decode()))
-+
-+# initialize BPF
-+b = BPF(text=bpf_text)
-+
-+b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_write_entry")
-+b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return")
-+
-+# trace address_space_operations .write_begin()
-+b.attach_kprobe(event="ext4_da_write_begin", fn_name="trace_aops_write_begin_entry")
-+b.attach_kprobe(event="ext4_write_begin", fn_name="trace_aops_write_begin_entry")
-+b.attach_kretprobe(event="ext4_da_write_begin", fn_name="trace_aops_write_begin_return")
-+b.attach_kretprobe(event="ext4_write_begin", fn_name="trace_aops_write_begin_return")
-+
-+# trace address_space_operations .write_end()
-+b.attach_kprobe(event="ext4_da_write_end", fn_name="trace_aops_write_end_entry")
-+b.attach_kprobe(event="ext4_write_end", fn_name="trace_aops_write_end_entry")
-+b.attach_kretprobe(event="ext4_da_write_end", fn_name="trace_aops_write_end_return")
-+b.attach_kretprobe(event="ext4_write_end", fn_name="trace_aops_write_end_return")
-+
-+# trace page cache allocation
-+b.attach_kprobe(event="grab_cache_page_write_begin", fn_name="trace_pagecache_alloc_entry")
-+b.attach_kretprobe(event="grab_cache_page_write_begin", fn_name="trace_pagecache_alloc_return")
-+
-+# trace journal start
-+b.attach_kprobe(event="jbd2__journal_start", fn_name="trace_journal_start_entry")
-+b.attach_kretprobe(event="jbd2__journal_start", fn_name="trace_journal_start_return")
-+
-+# trace block allocation
-+b.attach_kprobe(event="ext4_da_get_block_prep", fn_name="trace_block_alloc_entry")
-+b.attach_kretprobe(event="ext4_da_get_block_prep", fn_name="trace_block_alloc_return")
-+b.attach_kprobe(event="_ext4_get_block", fn_name="trace_block_alloc_entry")
-+b.attach_kretprobe(event="_ext4_get_block", fn_name="trace_block_alloc_return")
-+
-+# trace balance dirty pages
-+b.attach_kprobe(event="balance_dirty_pages_ratelimited", fn_name="trace_balance_dirty_entry")
-+b.attach_kretprobe(event="balance_dirty_pages_ratelimited", fn_name="trace_balance_dirty_return")
-+
-+# header
-+if min_ms == 0:
-+    print("Tracing ext4 operations")
-+else:
-+    print("Tracing ext4 operations slower than %d ms" % min_ms)
-+
-+print("%-8s %-8s %-14s %-6s %-7s %-8s %7s %8s %10s %11s %13s %11s %9s %11s %s" % ("TIME",
-+      "DEVICE", "COMM", "PID", "BYTES", "OFF_KB", "LAT(ms)", "pg_alloc", "jbd2_start",
-+      "block_alloc", "balance_dirty", "write_begin", "write_end", "sched_delay", "FILENAME"))
-+
-+# read events
-+b["events"].open_perf_buffer(print_event, page_cnt=64)
-+while 1:
-+    try:
-+        b.perf_buffer_poll()
-+
-+    except KeyboardInterrupt:
-+        exit()
-diff --git a/tools/aliext4writeslower_example.txt b/tools/aliext4writeslower_example.txt
-new file mode 100644
-index 0000000..b702d9d
---- /dev/null
-+++ b/tools/aliext4writeslower_example.txt
-@@ -0,0 +1,68 @@
-+Demonstrations of aliext4writeslower.
-+
-+This script traces key steps of write(2) which maybe time-consuming,
-+here trace 4 key steps: jbd2 handle start(it may need jbd2 transaction
-+checkpoint or commit), page cache allocation, block allocation and
-+balance dirty pages, and we also take into account sched delay.
-+With this tool, we can clearly know write(2)'s latency proportion.
-+
-+Usage:
-+usage: aliext4writeslower [-h] [-p PID] [-d DEVICE] [min_ms]
-+
-+Trace common ext4 write(2) slower than a threshold
-+
-+positional arguments:
-+  min_ms                minimum I/O duration to trace, in ms (default 10)
-+
-+optional arguments:
-+  -h, --help            show this help message and exit
-+  -p PID, --pid PID     trace this PID only
-+  -d DEVICE, --device DEVICE
-+                        inspect specified device
-+
-+examples:
-+    ./aliext4writeslower             # trace write(2) slower than 10 ms (default)
-+    ./aliext4writeslower 1           # trace write(2) slower than 1 ms
-+    ./aliext4writeslower 0           # trace all operations (warning: verbose)
-+    ./aliext4writeslower -d sda5     # inspect specified device /dev/sda5 containing ext4 fs
-+    ./aliext4writeslower -p 185      # trace PID 185 only
-+
-+
-+Output example:
-+
-+Example 1:
-+# Trace write(2) slower than 5 ms
-+python aliext4writeslower.py 5
-+
-+Tracing ext4 operations slower than 5 ms
-+TIME     DEVICE   COMM           PID    BYTES   OFF_KB   LAT(ms) pg_alloc jbd2_start block_alloc balance_dirty write_begin write_end sched_delay FILENAME
-+17:14:07 sda5     dd             31149  1048576 473088      5.15     1.02       0.68        0.25          0.54        2.50      1.57        0.00 testfile
-+17:14:07 sda5     dd             31149  1048576 474112      5.27     1.07       0.68        0.27          0.54        2.58      1.59        0.00 testfile
-+17:14:07 sda5     dd             31149  1048576 475136      5.09     1.01       0.69        0.30          0.53        2.52      1.49        0.00 testfile
-+17:14:07 sda5     dd             31149  1048576 607232      5.12     1.09       0.67        0.21          0.54        2.52      1.50        0.00 testfile
-+17:14:07 sda5     dd             31149  1048576 637952      6.88     1.20       1.15        0.34          0.75        3.38      1.97        0.00 testfile
-+17:14:07 sda5     dd             31149  1048576 674816      5.29     1.17       0.68        0.20          0.55        2.60      1.58        0.00 testfile
-+17:14:08 sda5     dd             31149  1048576 738304      5.08     0.99       0.69        0.27          0.54        2.46      1.54        0.00 testfile
-+17:14:08 sda5     dd             31149  1048576 790528      5.03     0.94       0.69        0.25          0.54        2.42      1.50        0.00 testfile
-+17:14:08 sda5     dd             31149  1048576 803840      5.04     0.96       0.67        0.29          0.54        2.45      1.51        0.00 testfile
-+17:14:08 sda5     dd             31149  1048576 870400      5.15     1.10       0.68        0.19          0.55        2.52      1.53        0.00 testfile
-+17:14:09 sda5     dd             31149  1048576 986112      5.08     1.00       0.69        0.27          0.54        2.48      1.51        0.00 testfile
-+
-+
-+Example 2:
-+# Inspect specified device /dev/sda5 containing ext4 fs
-+python aliext4writeslower.py -d sda5
-+
-+Tracing ext4 operations
-+TIME     DEVICE   COMM           PID    BYTES   OFF_KB   LAT(ms) pg_alloc jbd2_start block_alloc balance_dirty write_begin write_end sched_delay FILENAME
-+17:16:24 sda3     su             32894  292     0           0.03     0.00       0.00        0.00          0.00        0.01      0.00        0.00 lastlog
-+17:16:24 sda3     syslog-ng      990    67      456973      0.07     0.01       0.01        0.00          0.00        0.02      0.01        0.00 secure-20190805
-+17:16:24 sda3     syslog-ng      990    106     456973      0.02     0.00       0.00        0.00          0.00        0.01      0.01        0.00 secure-20190805
-+17:16:24 sda3     syslog-ng      990    67      475637      0.02     0.00       0.00        0.00          0.00        0.01      0.01        0.00 messages-20190805
-+17:16:24 sda3     syslog-ng      990    106     475637      0.02     0.00       0.00        0.00          0.00        0.01      0.01        0.00 messages-20190805
-+17:16:24 sda3     ilogtail       56674  199     3296        0.04     0.00       0.01        0.00          0.00        0.01      0.01        0.00 000147.log
-+17:16:24 sda3     syslog-ng      990    95      456974      0.04     0.01       0.01        0.00          0.00        0.01      0.01        0.00 secure-20190805
-+17:16:24 sda3     syslog-ng      990    95      475637      0.02     0.00       0.00        0.00          0.00        0.01      0.01        0.00 messages-20190805
-+17:16:24 sda3     xperf          2558   874     403         0.03     0.00       0.01        0.00          0.00        0.01      0.01        0.00 xperf.pmu.data.0
-+17:16:25 sda3     xperf          2558   819     403         0.06     0.01       0.01        0.00          0.01        0.02      0.01        0.00 xperf.pmu.data.0
-+17:16:25 sda3     syslog-ng      990    175     456974      0.05     0.01       0.01        0.00          0.00        0.01      0.01        0.00 secure-20190805
-diff --git a/tools/alihardirqs.py b/tools/alihardirqs.py
-new file mode 100755
-index 0000000..b8b0500
---- /dev/null
-+++ b/tools/alihardirqs.py
-@@ -0,0 +1,219 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# alihardirqs   Summarize hard IRQ (interrupt) event time.
-+#               Enhanced version of BCC hardirqs.
-+#               For Linux, uses BCC, eBPF.
-+#
-+# USAGE: alihardirqs [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] [interval] [outputs]
-+#
-+# Copyright (c) 2015 Brendan Gregg.
-+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+
-+from __future__ import print_function
-+from bcc import BPF
-+from time import sleep, strftime
-+import argparse
-+
-+# arguments
-+examples = """examples:
-+    ./alihardirqs            # sum hard irq event time
-+    ./alihardirqs -D         # show hard irq event time as histograms
-+    ./alihardirqs -c 0       # show hard irq event time of CPU 0 only
-+    ./alihardirqs -i 25      # show event time of IRQ 25 only
-+    ./alihardirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only
-+    ./alihardirqs 1 10       # print 1 second summaries, 10 times
-+    ./alihardirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Summarize hard irq statistics, including total count, time, etc.",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-T", "--timestamp", action="store_true",
-+    help="include timestamp on output")
-+parser.add_argument("-N", "--nanoseconds", action="store_true",
-+    help="output in nanoseconds")
-+parser.add_argument("-D", "--dist", action="store_true",
-+    help="show distributions as histograms")
-+parser.add_argument("-c", "--cpu",
-+    help="output hardIRQ statistics on specific CPU only")
-+parser.add_argument("-i", "--irq",
-+    help="output statistics of specific hardIRQ")
-+parser.add_argument("interval", nargs="?", default=99999999,
-+    help="output interval, in seconds")
-+parser.add_argument("outputs", nargs="?", default=99999999,
-+    help="number of outputs")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+
-+args = parser.parse_args()
-+countdown = int(args.outputs)
-+distmode = "1" if args.dist else "0"
-+if args.nanoseconds:
-+    factor = 1
-+    label = "nsecs"
-+else:
-+    factor = 1000
-+    label = "usecs"
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+
-+typedef struct irq_key {
-+    u32 cpu;
-+    u32 irq;
-+    char name[32];
-+} irq_key_t;
-+
-+typedef struct irq_key_slot {
-+    irq_key_t key;
-+    u64 slot;
-+} irq_key_slot_t;
-+
-+typedef struct irq_val {
-+    u64 count;
-+    u64 time;
-+    u64 time_max;
-+} irq_val_t;
-+
-+typedef struct stamp {
-+    u64 ts;
-+    char name[32];
-+} stamp_t;
-+
-+BPF_PERCPU_ARRAY(start, stamp_t, 1);
-+BPF_HASH(res, irq_key_t, irq_val_t);
-+BPF_HISTOGRAM(dist, irq_key_slot_t);
-+
-+
-+TRACEPOINT_PROBE(irq, irq_handler_entry)
-+{
-+    COND_FILTER
-+
-+    u32 idx = 0;
-+    stamp_t stamp = {
-+        .ts = bpf_ktime_get_ns(),
-+    };
-+    TP_DATA_LOC_READ_CONST(stamp.name, name, sizeof(stamp.name));
-+    start.update(&idx, &stamp);
-+
-+    return 0;
-+}
-+
-+TRACEPOINT_PROBE(irq, irq_handler_exit)
-+{
-+    u64 delta;
-+    u32 cpu = bpf_get_smp_processor_id();
-+
-+    u32 idx = 0;
-+    // fetch timestamp and calculate delta
-+    stamp_t *stampp = start.lookup(&idx);
-+    if (!stampp || stampp->ts == 0) {
-+        return 0;   // missed start
-+    }
-+
-+    delta = bpf_ktime_get_ns() - stampp->ts;
-+    irq_key_t key = {.cpu = cpu, .irq = args->irq};
-+    char *name = (char *)stampp->name;
-+    bpf_probe_read(key.name, sizeof(key.name), name);
-+
-+#if DIST_MODE
-+    irq_key_slot_t key_slot = {
-+        .key = key,
-+        .slot = bpf_log2l(delta / FACTOR),
-+    };
-+    dist.increment(key_slot);
-+#else
-+    struct irq_val *valp, val;
-+    valp = res.lookup(&key);
-+
-+    if (valp) {
-+        valp->count += 1;
-+        valp->time += delta;
-+        if (valp->time_max < delta) {valp->time_max = delta;}
-+    }
-+    else {
-+        val.count = 1;
-+        val.time = val.time_max = delta;
-+        res.update(&key, &val);
-+    }
-+#endif
-+    stamp_t zero = {};
-+    start.update(&idx, &zero);
-+
-+    return 0;
-+}
-+"""
-+
-+cond_filter = ''
-+cpu_filter = "1"
-+if args.cpu:
-+    cpu_filter = "bpf_get_smp_processor_id() == %s" % args.cpu
-+
-+irq_filter = "1"
-+if args.irq:
-+    irq_filter = "args->irq == %s" % args.irq
-+
-+cond_filter = "if (!(%s && %s)) {return 0;}" % (cpu_filter, irq_filter)
-+bpf_text = bpf_text.replace('COND_FILTER', cond_filter)
-+
-+bpf_text = bpf_text.replace('FACTOR', "%d" % factor)
-+bpf_text = bpf_text.replace('DIST_MODE', distmode)
-+
-+
-+# output eBPF program C code after it is replaced, used by debugging
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+
-+def print_section(key):
-+    return "CPU %d, hardIRQ %d (%s)" % (key[0], key[1], key[2].decode('utf-8', 'replace'))
-+
-+
-+# load BPF program
-+b = BPF(text=bpf_text)
-+print("Tracing hard irq event time... Hit Ctrl-C to end.")
-+
-+if args.dist:
-+    tab = b.get_table("dist")
-+else:
-+    tab = b.get_table("res")
-+
-+
-+# output
-+while (1):
-+    try:
-+        sleep(int(args.interval))
-+    except KeyboardInterrupt:
-+        countdown = 1
-+    print()
-+    if args.timestamp:
-+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
-+
-+    if args.dist:
-+        tab.print_log2_hist("time_" + label, "section",
-+            section_print_fn=print_section,
-+            bucket_fn=lambda k: (k.cpu, k.irq, k.name),
-+            bucket_sort_fn=lambda keys: sorted(keys))
-+    else:
-+        cur_cpu = -1
-+        for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].irq)):
-+            if cur_cpu != k.cpu:
-+                print("\nCPU%d:" % k.cpu)
-+                print("%10s %20s %10s %10s %10s" %
-+                    ("HARDIRQ", "IRQHandler", "Count", "TOTAL_" + label, "MAX_" + label))
-+                cur_cpu = k.cpu
-+
-+            print("%10d %20s %10d %10d %10d" %
-+                (k.irq, k.name.decode('utf-8', 'replace'), v.count, v.time / factor, v.time_max / factor))
-+
-+        tab.clear()
-+
-+    countdown -= 1
-+    if countdown == 0:
-+        exit()
-diff --git a/tools/alihardirqs_example.txt b/tools/alihardirqs_example.txt
-new file mode 100644
-index 0000000..caebb47
---- /dev/null
-+++ b/tools/alihardirqs_example.txt
-@@ -0,0 +1,93 @@
-+Demonstrations of alihardirqs, the enhanced version of BCC hardirqs.
-+
-+This program traces hard interrupts (irqs), and stores timing statistics
-+in-kernel for efficiency.
-+
-+
-+Example 1
-+
-+By default, all statistics of all hardirq on all CPUs are sampled,
-+and the output is displayed in a per-hardirq and per-CPU flavor.
-+
-+# alihardirqs
-+Tracing hard irq event time... Hit Ctrl-C to end.
-+^C
-+
-+CPU3:
-+   HARDIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+        31      virtio0-input.0          2          4          2
-+
-+
-+Example 2
-+
-+You can sample statistics on specified CPU only with "-c" option,
-+and the output will be displayed in a per-hardirq flavor.
-+
-+# alihardirqs -c 3
-+Tracing hard irq event time... Hit Ctrl-C to end.
-+^C
-+
-+CPU3:
-+   HARDIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+        31      virtio0-input.0          3         10          7
-+
-+
-+Exampel 3
-+
-+You can sample statistics of specified hardirq only with "-i" option,
-+and the output will be displayed in a per-CPU flavor.
-+
-+# alihardirqs -i 31
-+Tracing hard irq event time... Hit Ctrl-C to end.
-+^C
-+
-+CPU3:
-+   HARDIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+        31      virtio0-input.0          3          8          6
-+
-+
-+Example 4
-+
-+The distribution of hardirq run time can be printed as a histogram with "-D"
-+option. The output will be displayed in a per-hardirq and per-CPU flavor.
-+
-+# alihardirqs -D
-+Tracing hard irq event time... Hit Ctrl-C to end.
-+^C
-+
-+section = CPU 3, hardIRQ 31 (virtio0-input.0)
-+     time_usecs          : count     distribution
-+         0 -> 1          : 1        |****************************************|
-+         2 -> 3          : 0        |                                        |
-+         4 -> 7          : 1        |****************************************|
-+
-+
-+
-+USAGE message:
-+
-+# alihardirqs -h
-+usage: alihardirqs.py [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ]
-+                       [interval] [outputs]
-+
-+Summarize hard irq statistics, including total count, time, etc.
-+
-+positional arguments:
-+  interval           output interval, in seconds
-+  outputs            number of outputs
-+
-+optional arguments:
-+  -h, --help         show this help message and exit
-+  -T, --timestamp    include timestamp on output
-+  -N, --nanoseconds  output in nanoseconds
-+  -D, --dist         show distributions as histograms
-+  -c CPU, --cpu CPU  output hardIRQ statistics on specific CPU only
-+  -i IRQ, --irq IRQ  output statistics of specific hardIRQ
-+
-+examples:
-+    ./alihardirqs            # sum hard irq event time
-+    ./alihardirqs -D         # show hard irq event time as histograms
-+    ./alihardirqs -c 0       # show hard irq event time of CPU 0 only
-+    ./alihardirqs -i 25      # show event time of IRQ 25 only
-+    ./alihardirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only
-+    ./alihardirqs 1 10       # print 1 second summaries, 10 times
-+    ./alihardirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
-diff --git a/tools/alijbd2stat.py b/tools/alijbd2stat.py
-new file mode 100755
-index 0000000..f727fc1
---- /dev/null
-+++ b/tools/alijbd2stat.py
-@@ -0,0 +1,392 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# alijbd2stat    Summarize filesystem jbd2 stat.
-+#       For Linux, uses BCC, eBPF.
-+#
-+# USAGE: alijbd2stat [-h] [-t thresh] [-d device]
-+#
-+# Copyright (c) 2019-2021 Alibaba Group.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+#
-+# 2019/08/05 Xiaoguang Wang Created this.
-+
-+from __future__ import print_function
-+from bcc import BPF
-+import ctypes as ct
-+import argparse
-+import time
-+
-+# arguments
-+examples = """examples:
-+    ./alijbd2stat          # summarize filesystem jbd2 stat
-+    ./alijbd2stat -d sda3  # inspect specified device /dev/sda3
-+    ./alijbd2stat -t 10    # show jbd2 handle's context when it runs more than 10ms
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Summarize filesystem jbd2 stat",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-d", "--device", help="inspect specified device")
-+parser.add_argument("-t", "--thresh_time", nargs="?", default=0,
-+    help="show jbd2 handle's context when its run time is greater than this value")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+args = parser.parse_args()
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+#include <linux/sched.h>
-+#include <linux/blkdev.h>
-+#include <linux/jbd2.h>
-+
-+struct handle_stat {
-+	dev_t dev;
-+	u32 type;
-+	u32 line_no;
-+	unsigned long tid;
-+	unsigned long pre_start_time;
-+	unsigned long start_time;
-+	unsigned long end_time;
-+	u64 sched_delay;
-+	u64 kernel_ip;
-+	int kernel_stack_id;
-+	u32 pid;
-+	char comm[TASK_COMM_LEN];
-+};
-+
-+struct transaction_run_stat {
-+	dev_t dev;
-+	u32 handle_count;
-+	u32 blocks;
-+	u32 blocks_logged;
-+	unsigned long tid;
-+	unsigned long wait;
-+	unsigned long request_delay;
-+	unsigned long running;
-+	unsigned long locked;
-+	unsigned long flushing;
-+	unsigned long logging;
-+};
-+
-+/*
-+struct handle_info_per_type {
-+	u32 count
-+};
-+*/
-+
-+BPF_HASH(handle_stat_map, struct task_struct *, struct handle_stat);
-+BPF_STACK_TRACE(stack_traces, 1024);
-+BPF_PERF_OUTPUT(jbd2_handle_stat);
-+BPF_PERF_OUTPUT(transaction_stat);
-+
-+
-+int trace_start_this_handle(struct pt_regs *ctx, journal_t *journal,
-+		handle_t *handle, gfp_t gfp_mask)
-+{
-+	struct handle_stat s;
-+	struct task_struct *t = (struct task_struct *)bpf_get_current_task();
-+	dev_t dev = journal->j_fs_dev->bd_dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	memset(&s, 0, sizeof(struct handle_stat));
-+	s.dev = dev;
-+	s.pre_start_time = bpf_ktime_get_ns();
-+	s.sched_delay = t->sched_info.run_delay;
-+
-+	handle_stat_map.update(&t, &s);
-+	return 0;
-+}
-+
-+TRACEPOINT_PROBE(jbd2, jbd2_handle_start)
-+{
-+	struct handle_stat *s;
-+	struct task_struct *t = (struct task_struct *)bpf_get_current_task();
-+	dev_t dev = args->dev;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	s = handle_stat_map.lookup(&t);
-+	if (s == NULL)
-+		return 0;
-+
-+	s->start_time = bpf_ktime_get_ns();
-+	s->type = args->type;
-+	s->line_no = args->line_no;
-+	s->tid = args->tid;
-+	return 0;
-+}
-+
-+int trace_jbd2_journal_stop(struct pt_regs *ctx, handle_t *handle)
-+{
-+	transaction_t *transaction = handle->h_transaction;
-+	journal_t *journal;
-+	struct handle_stat *s;
-+	struct handle_stat s2 = {};
-+	struct task_struct *t = (struct task_struct *)bpf_get_current_task();
-+	unsigned long end_time = bpf_ktime_get_ns();
-+	unsigned long run_time;
-+
-+	if (transaction == NULL) {
-+		handle_stat_map.delete(&t);
-+		return 0;
-+	}
-+
-+	journal = transaction->t_journal;
-+	dev_t dev = journal->j_fs_dev->bd_dev;
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	if (handle->h_ref >= 2)
-+		return 0;
-+
-+	s = handle_stat_map.lookup(&t);
-+	if (s == NULL)
-+		return 0;
-+
-+	/*
-+	 * This could happen, jbd2_journal_start_reserved() miss a trace
-+	 * jbd2_handle_start().
-+	 */
-+	if (!s->start_time)
-+		s->start_time = s->pre_start_time;
-+
-+
-+	run_time = end_time - s->pre_start_time;
-+	if (run_time < FILTER_THRESH_TIME * 1000 * 1000LLU) {
-+		handle_stat_map.delete(&t);
-+		return 0;
-+	}
-+
-+	s2.dev = s->dev;
-+	s2.type = s->type;
-+	s2.line_no = s->line_no;
-+	s2.tid = s->tid;
-+	s2.pre_start_time = s->pre_start_time;
-+	s2.start_time = s->start_time;
-+	s2.end_time = end_time;
-+	s2.pid = bpf_get_current_pid_tgid();
-+	s2.sched_delay = t->sched_info.run_delay - s->sched_delay;
-+	bpf_get_current_comm(&s2.comm, sizeof(s2.comm));
-+
-+	s2.kernel_stack_id = stack_traces.get_stackid(ctx, 0);
-+	if (s2.kernel_stack_id >= 0) {
-+		// populate extras to fix the kernel stack
-+		u64 ip = PT_REGS_IP(ctx);
-+		u64 page_offset;
-+
-+		// if ip isn't sane, leave key ips as zero for later checking
-+		#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
-+			// x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
-+			page_offset = __PAGE_OFFSET_BASE;
-+		#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
-+			// x64, 4.17, and later
-+		#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
-+			page_offset = __PAGE_OFFSET_BASE_L5;
-+		#else
-+			page_offset = __PAGE_OFFSET_BASE_L4;
-+		#endif
-+		#else
-+			// earlier x86_64 kernels, e.g., 4.6, comes here
-+			// arm64, s390, powerpc, x86_32
-+			page_offset = PAGE_OFFSET;
-+		#endif
-+
-+		if (ip > page_offset)
-+			s2.kernel_ip = ip;
-+	}
-+
-+	jbd2_handle_stat.perf_submit(ctx, &s2, sizeof(struct handle_stat));
-+	handle_stat_map.delete(&t);
-+	return 0;
-+}
-+
-+TRACEPOINT_PROBE(jbd2, jbd2_run_stats)
-+{
-+	struct transaction_run_stat s;
-+	dev_t dev = args->dev;;
-+
-+	if (FILTER_DEV)
-+		return 0;
-+
-+	memset(&s, 0, sizeof(struct transaction_run_stat));
-+	s.dev = args->dev;
-+	s.tid = args->tid;
-+	s.wait = args->wait;
-+	s.request_delay = args->request_delay;
-+	s.running = args->running;
-+	s.locked = args->locked;
-+	s.flushing = args->flushing;
-+	s.logging = args->logging;
-+	s.handle_count = args->handle_count;
-+	s.blocks = args->blocks;
-+	s.blocks_logged = args->blocks_logged;
-+
-+	transaction_stat.perf_submit(args, &s, sizeof(struct transaction_run_stat));
-+	return 0;
-+}
-+
-+"""
-+devid2name={}
-+num_trans=0
-+
-+def init_dev_name():
-+        global devid2name
-+
-+        f = open("/proc/partitions")
-+        for line in f.readlines():
-+                line = line.strip()
-+                if not len(line):
-+                        continue;
-+
-+                str = line.split()
-+                if str[0].isalpha():
-+                        continue
-+                dev_num = (int(str[0]) << 20) + int(str[1])
-+                devid2name[dev_num] = str[3]
-+        f.close()
-+
-+init_dev_name()
-+
-+name2devid={v:k for k,v in devid2name.items()}
-+
-+if args.device:
-+	bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device])
-+else:
-+	bpf_text = bpf_text.replace('FILTER_DEV', '0')
-+
-+if args.thresh_time:
-+	bpf_text = bpf_text.replace('FILTER_THRESH_TIME', '%s' % args.thresh_time)
-+else:
-+	bpf_text = bpf_text.replace('FILTER_THRESH_TIME', '9999')
-+
-+# code substitutions
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+# load BPF program
-+b = BPF(text=bpf_text)
-+
-+b.attach_kprobe(event="start_this_handle", fn_name="trace_start_this_handle")
-+b.attach_kprobe(event="jbd2_journal_stop", fn_name="trace_jbd2_journal_stop")
-+
-+print("Tracing jbd2 stats... Hit Ctrl-C to end.")
-+
-+TASK_COMM_LEN = 16
-+
-+class HandleStat(ct.Structure):
-+	_fields_ = [
-+		("dev", ct.c_uint),
-+		("type", ct.c_uint),
-+		("line_no", ct.c_uint),
-+		("tid", ct.c_ulonglong),
-+		("pre_start_time", ct.c_ulonglong),
-+		("start_time", ct.c_ulonglong),
-+		("end_time", ct.c_ulonglong),
-+		("sched_delay", ct.c_ulonglong),
-+		("kernel_ip", ct.c_ulonglong),
-+		("kernel_stack_id", ct.c_int),
-+		("pid", ct.c_uint),
-+		("comm", ct.c_char * TASK_COMM_LEN),
-+    ]
-+
-+class Data(ct.Structure):
-+	_fields_ = [
-+		("dev", ct.c_uint),
-+		("handle_count", ct.c_uint),
-+		("blocks", ct.c_uint),
-+		("blocks_logged", ct.c_uint),
-+		("tid", ct.c_ulonglong),
-+		("wait", ct.c_ulonglong),
-+		("request_delay", ct.c_ulonglong),
-+		("running", ct.c_ulonglong),
-+		("locked", ct.c_ulonglong),
-+		("flushing", ct.c_ulonglong),
-+		("logging", ct.c_ulonglong),
-+    ]
-+
-+class trans_stat:
-+	def __init__(self, dev):
-+		self.dev=dev
-+		self.handle_count=0
-+		self.blocks=0
-+		self.blocks_logged=0
-+		self.tid=0
-+		slef.wait=0
-+		self.request_delay=0
-+		self.running=0
-+		self.locked=0
-+		self.flushing=0
-+		self.logging=0
-+
-+
-+def stack_id_err(stack_id):
-+	# -EFAULT in get_stackid normally means the stack-trace is not available,
-+	# Such as getting kernel stack trace in userspace code
-+	return (stack_id < 0) and (stack_id != -errno.EFAULT)
-+
-+def aksym(addr):
-+	return b.ksym(addr) + "_[k]".encode()
-+
-+def print_stack(cpu, data, size):
-+	k = ct.cast(data, ct.POINTER(HandleStat)).contents
-+
-+	kernel_tmp = [] if k.kernel_stack_id < 0 else stack_traces.walk(k.kernel_stack_id)
-+
-+	# fix kernel stack
-+	kernel_stack = []
-+	if k.kernel_stack_id >= 0:
-+		for addr in kernel_tmp:
-+			kernel_stack.append(addr)
-+		# the later IP checking
-+		if k.kernel_ip:
-+			kernel_stack.insert(0, k.kernel_ip)
-+
-+	# print default multi-line stack output
-+	if stack_id_err(k.kernel_stack_id):
-+		print("    [Missed Kernel Stack]")
-+	else:
-+		print("comm: %s pid: %u tid: %lu pre_start_time: %lu run_time: %lu sched_delay: %lu" %
-+			  (k.comm, k.pid, k.tid, (k.start_time - k.pre_start_time) / 1000,
-+			  (k.end_time - k.pre_start_time) / 1000, k.sched_delay / 1000 / 1000))
-+		for addr in kernel_stack:
-+			print("    %s" % aksym(addr))
-+	#print("    %-16s %s (%d)" % ("-", k.comm.decode('utf-8', 'replace'), k.pid))
-+	#print()
-+
-+def print_header():
-+	print("\n%-8s %-12s %-4s %-13s %-7s %-6s %-8s %-7s %-12s %-6s %-13s" % ("dev",
-+		  "tid", "wait", "request_delay", "running", "locked", "flushing", "logging",
-+		  "handle_count", "blocks", "blocks_logged"))
-+
-+# process event
-+def print_event(cpu, data, size):
-+	global num_trans
-+
-+	event = ct.cast(data, ct.POINTER(Data)).contents
-+	if (num_trans % 10) == 0:
-+		print_header()
-+
-+	print("%-8s %-12lu %-4lu %-13lu %-7lu %-6lu %-8lu %-7lu %-12lu %-6lu %-13lu" %
-+		  (devid2name[event.dev], event.tid, event.wait, event.request_delay, event.running,
-+		  event.locked, event.flushing, event.logging, event.handle_count, event.blocks,
-+		  event.blocks_logged))
-+	num_trans = num_trans + 1
-+
-+label="usecs"
-+stack_traces = b["stack_traces"]
-+b["jbd2_handle_stat"].open_perf_buffer(print_stack, page_cnt=64)
-+b["transaction_stat"].open_perf_buffer(print_event, page_cnt=64)
-+while 1:
-+    try:
-+        b.perf_buffer_poll()
-+
-+    except KeyboardInterrupt:
-+        exit()
-diff --git a/tools/alijbd2stat_example.txt b/tools/alijbd2stat_example.txt
-new file mode 100644
-index 0000000..a65f12e
---- /dev/null
-+++ b/tools/alijbd2stat_example.txt
-@@ -0,0 +1,83 @@
-+Demonstrations of alijbd2stat.
-+Alijbd2stat can trace every jbd2 handle and every transaction commit
-+info. If one jbd2 handle's running time is greater than user specified
-+threshold time, alijbd2stat will catch this handle's context.
-+
-+Usage:
-+alijbd2stat [-h] [-d DEVICE] [-t [THRESH_TIME]]
-+
-+Summarize filesystem jbd2 stat
-+
-+optional arguments:
-+  -h, --help            show this help message and exit
-+  -d DEVICE, --device DEVICE
-+                        inspect specified device
-+  -t [THRESH_TIME], --thresh_time [THRESH_TIME]
-+                        show jbd2 handle's context when its run time is
-+                        greater than this value
-+
-+examples:
-+    ./alijbd2stat          # show jbd2 transaction commit info.
-+    ./alijbd2stat -d sda3  # inspect specified device /dev/sda3.
-+    ./alijbd2stat -t 10    # show jbd2 handle's context when it runs more than 10ms.
-+
-+
-+Output example:
-+
-+Example 1:
-+# Show jbd2 transaction commit info.
-+python alijbd2stat.py
-+
-+dev      tid          wait request_delay running locked flushing logging handle_count blocks blocks_logged
-+sda5     11122351     0    0             5052    0      0        0       61           34     13
-+sda3     13641140     0    0             5280    0      0        0       138          19     20
-+sda5     11122352     0    0             5041    0      0        0       59           35     14
-+sda3     13641141     0    0             5000    0      0        0       113          9      10
-+sda5     11122353     0    0             5069    0      0        0       91           47     18
-+
-+Example 2:
-+# inspect specified device /dev/sda5
-+python alijbd2stat.py  -d sda5
-+
-+dev      tid          wait request_delay running locked flushing logging handle_count blocks blocks_logged
-+sda5     11122461     0    0             5029    0      0        0       75           38     15
-+sda5     11122462     0    0             5107    0      0        0       74           42     17
-+sda5     11122463     0    0             5028    0      0        1       88           50     17
-+sda5     11122464     0    0             5128    0      0        0       71           40     19
-+sda5     11122465     0    0             5243    0      0        0       83           44     13
-+sda5     11122466     0    0             5176    0      0        0       90           63     20
-+sda5     11122467     0    0             5029    0      0        0       60           34     13
-+sda5     11122468     0    0             5028    0      0        1       79           38     19
-+sda5     11122469     0    0             5030    0      0        0       44           27     12
-+sda5     11122470     0    0             5614    0      0        0       106          62     25
-+
-+Example 3:
-+# inspect specified device /dev/sda5, and show jbd2 handle's context when one
-+# jbd2 handle runs more than 1ms.
-+python alijbd2stat.py  -d sda5 -t 1
-+
-+comm: dd pid: 106212 tid: 11151230 pre_start_time: 0 run_time: 585432 sched_delay: 0
-+    jbd2_journal_stop_[k]
-+    jbd2_journal_stop_[k]
-+    ext4_dirty_inode_[k]
-+    __mark_inode_dirty_[k]
-+    generic_write_end_[k]
-+    ext4_da_write_end_[k]
-+    generic_perform_write_[k]
-+    __generic_file_write_iter_[k]
-+    ext4_file_write_iter_[k]
-+    __vfs_write_[k]
-+    vfs_write_[k]
-+    sys_write_[k]
-+    do_syscall_64_[k]
-+    entry_SYSCALL_64_after_swapgs_[k]
-+comm: kworker/u128:3 pid: 83229 tid: 0 pre_start_time: 0 run_time: 589686 sched_delay: 0
-+    jbd2_journal_stop_[k]
-+    jbd2_journal_stop_[k]
-+    ext4_convert_unwritten_extents_[k]
-+    ext4_end_io_rsv_work_[k]
-+    process_one_work_[k]
-+    worker_thread_[k]
-+    kthread_[k]
-+    ret_from_fork_[k]
-+sda5     11151229     0    0             5000    0      0        595     401233       255    36
-diff --git a/tools/alimutexsnoop.py b/tools/alimutexsnoop.py
-new file mode 100755
-index 0000000..d28ce05
---- /dev/null
-+++ b/tools/alimutexsnoop.py
-@@ -0,0 +1,273 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+from __future__ import print_function
-+
-+import argparse
-+import ctypes as ct
-+import os
-+import platform
-+import re
-+import sys
-+import time
-+
-+from bcc import BPF
-+from bcc.utils import printb
-+from datetime import datetime
-+from time import strftime
-+
-+#
-+# alimutexsnoop.py Trace all process that holds a mutex longer than a
-+# specified time interval (500us, etc).
-+#           For Linux, uses BCC, eBPF. Embedded C.
-+#
-+# USAGE: alimutexsnoop [-h] [-t] [-p PID]
-+#
-+# Author: Alvin Zheng
-+_examples = """examples:
-+    alimutexsnoop           # trace all mutexes that are held for a long time
-+    alimutexsnoop -t 50     # set the time threshold (us), mutexes are held longer
-+                              than this time threshold will be printed
-+    alimutexsnoop -p 123    # only trace the specified process
-+"""
-+"""
-+    Copyright (c) 2019 Alvin Zheng, Alibaba, Inc.
-+    Licensed under the Apache License, Version 2.0 (the "License")
-+"""
-+
-+def positive_int(val):
-+    try:
-+        ival = int(val)
-+    except ValueError:
-+        raise argparse.ArgumentTypeError("must be an integer")
-+    if ival <= 0:
-+        raise argparse.ArgumentTypeError("must be positive value")
-+    return ival
-+
-+def _getParser():
-+    parser = argparse.ArgumentParser(
-+        description="Trace all mutexes that have been held for a long time",
-+        formatter_class=argparse.RawDescriptionHelpFormatter,
-+        epilog=_examples)
-+    a = parser.add_argument
-+    a("-t", "--threshold", type=positive_int, default=1000,
-+      help="The mutexes held longer than this time threshold will be traced(us)")
-+    a("-p", "--pid", type=positive_int,
-+      help="trace this PID only")
-+    return parser.parse_args
-+
-+
-+class Global():
-+    parse_args = _getParser()
-+    args = None
-+    argv = None
-+    bpf = None
-+
-+class Data(ct.Structure):
-+    """Event data matching struct data_t in _embedded_c()."""
-+    _TASK_COMM_LEN = 16      # linux/sched.h
-+    _pack_ = 1
-+    _fields_ = [
-+        ("start_time", ct.c_ulonglong),  # task->start_time
-+        ("pid_tgid", ct.c_ulonglong),
-+        ("pid", ct.c_uint),  # task->tgid, thread group id == sys_getpid()
-+        ("mutex_addr", ct.c_ulonglong),
-+        ("k_stack_id", ct.c_int),
-+        ("u_stack_id", ct.c_int),
-+        ("task", ct.c_char * _TASK_COMM_LEN)
-+    ]
-+
-+def _embedded_c(args):
-+    """Generate C programs for mutex_lock and mutex_unlock."""
-+    c = """
-+    EBPF_COMMENT
-+    #include <linux/sched.h>
-+    BPF_STATIC_ASSERT_DEF
-+    BPF_STACK_TRACE(stack_traces,655360);
-+    struct key_t{
-+        u64 pid_tgid;
-+        u64 mutex_addr;
-+    } __attribute__((packed));
-+    BPF_HASH(held_mutexes, struct key_t,u64);
-+    struct data_t{
-+        u64 start_time;
-+        u64 pid_tgid;
-+        u32 pid;
-+        u64 mutex_addr;
-+        u32 k_stack_id;
-+        u32 u_stack_id;
-+        char task[TASK_COMM_LEN];
-+    } __attribute__((packed));
-+    BPF_STATIC_ASSERT(sizeof(struct data_t) == CTYPES_SIZEOF_DATA);
-+    BPF_PERF_OUTPUT(results);
-+
-+    int kprobe_ret_mutex_lock(struct pt_regs * ctx,struct mutex * lock)
-+    {
-+        struct task_struct *task = (typeof(task))bpf_get_current_task();
-+        FILTER_SELF_LOCK
-+        if (FILTER_PID) { return 0; }
-+        struct key_t key = {
-+            .pid_tgid=bpf_get_current_pid_tgid(),
-+            .mutex_addr=(u64)((void*)lock)
-+        };
-+        u64 locktime = bpf_ktime_get_ns();
-+        u64 * ptr=held_mutexes.lookup_or_init(&key, &locktime);
-+        if(!ptr){
-+            bpf_trace_printk("could not add held_mutexes, thread: %d, mutex: %p \\n" ,task->tgid,lock);
-+            return 1;
-+        }
-+        return 0;
-+    }
-+    int kprobe_mutex_unlock(struct pt_regs * ctx, struct mutex * lock)
-+    {
-+        struct task_struct *task = (typeof(task))bpf_get_current_task();
-+        FILTER_SELF_UNLOCK
-+        if (FILTER_PID) { return 0; }
-+        struct key_t key = {
-+            .pid_tgid=bpf_get_current_pid_tgid(),
-+            .mutex_addr=(u64)((void*)lock)
-+        };
-+        u64 cur = bpf_ktime_get_ns();
-+        u64 * locktime=held_mutexes.lookup(&key);
-+        if(!locktime){
-+            bpf_trace_printk("cannot find the lock in thread: %d mutex: %p \\n",task->tgid,lock);
-+            return 1;
-+        }
-+        if(cur - (*locktime) > TIME_THRESHOLD){
-+            struct data_t data = {
-+                .start_time=task->start_time,
-+                .pid_tgid=bpf_get_current_pid_tgid(),
-+                .pid=task->pid,
-+                .mutex_addr=(u64)((void*)lock)
-+            };
-+            bpf_get_current_comm(&data.task,sizeof(data.task));
-+            data.k_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID);
-+            data.u_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID|BPF_F_USER_STACK);
-+            results.perf_submit(ctx,&data,sizeof(data));
-+        }
-+        held_mutexes.delete(&key);
-+        return 0;
-+    }
-+    """
-+    # TODO: this macro belongs in bcc/src/cc/export/helpers.h
-+    bpf_static_assert_def = r"""
-+    #ifndef BPF_STATIC_ASSERT
-+    #define BPF_STATIC_ASSERT(condition) __attribute__((unused)) \
-+    extern int bpf_static_assert[(condition) ? 1 : -1]
-+    #endif
-+    """
-+    selfpid = os.getpid()
-+    filter_self = r"""
-+    if(task->tgid==%d){
-+        return 0;
-+    }
-+    """ % selfpid
-+    code_substitutions = [
-+        ('EBPF_COMMENT', _ebpf_comment()),
-+        ("BPF_STATIC_ASSERT_DEF", bpf_static_assert_def),
-+        ("CTYPES_SIZEOF_DATA", str(ct.sizeof(Data))),
-+        ("TIME_THRESHOLD", str(Global.args.threshold * 1000)),  # ns->us
-+        ("FILTER_SELF_LOCK", filter_self),
-+        ("FILTER_SELF_UNLOCK", filter_self),
-+        ('FILTER_PID', '0' if not Global.args.pid else "task->pid != %d" % Global.args.pid)
-+    ]
-+    for old, new in code_substitutions:
-+        c = c.replace(old, new)
-+    # print(c)
-+    return c
-+
-+def _ebpf_comment():
-+    """Return a C-style comment with information about the generated code."""
-+    comment = ('Created by %s at %s:\n\t%s' %
-+                    (sys.argv[0], strftime("%Y-%m-%d %H:%M:%S %Z"), _embedded_c.__doc__))
-+    args = str(vars(Global.args)).replace('{', '{\n\t').replace(', ', ',\n\t').replace('}', ',\n }\n\n')
-+    return ("\n   /*" + ("\n %s\n\n ARGV = %s\n\n ARGS = %s/" %
-+                             (comment, ' '.join(Global.argv), args))
-+                   .replace('\n', '\n\t*').replace('\t', '    '))
-+
-+def _print_header():
-+    print("%-16s %-6s  %-16s \n" % ("PCOMM", "PID", "MUTEX_ADDR"))
-+
-+def _print_event(cpu, data, size):  # callback
-+    """Print the long-held mutexes."""
-+    stack_traces = Global.bpf['stack_traces']
-+    e = ct.cast(data, ct.POINTER(Data)).contents
-+    print("%-16s %-6d  %-16x\n" %
-+              (e.task.decode(), e.pid, e.mutex_addr), end="")
-+    # print the kernel stack
-+    if e.k_stack_id >= 0:
-+        cur_stack = reversed(list(stack_traces.walk(e.k_stack_id)))
-+        for addr in cur_stack:
-+            printb(b"Kernel Stack:    %-16x  %s" % (addr, Global.bpf.ksym(addr)))
-+    else:
-+        print("No KERNEL STACK FOUND")
-+    # print the user stack
-+    if e.u_stack_id >= 0:
-+        cur_stack = reversed(list(stack_traces.walk(e.u_stack_id)))
-+        for addr in cur_stack:
-+            printb(b"USER Stack:      %-16x  %s" % (addr, Global.bpf.sym(addr, e.pid_tgid)))
-+    else:
-+        print("No USER STACK FOUND")
-+    print("=================================")
-+    print()
-+# =============================
-+# Module: These functions are available for import
-+# =============================
-+def initialize(arg_list=sys.argv[1:]):
-+    """Trace all long-held mutexes
-+
-+    arg_list - list of args, if omitted then uses command line args
-+               arg_list is passed to argparse.ArgumentParser.parse_args()
-+
-+    Returns a tuple (return_code, result)
-+       0 = Ok, result is the return value from BPF()
-+       1 = args.ebpf is requested, result is the generated C code
-+       os.EX_NOPERM: need CAP_SYS_ADMIN, result is error message
-+       os.EX_SOFTWARE: internal software error, result is error message
-+    """
-+    Global.argv = arg_list
-+    Global.args = Global.parse_args(arg_list)
-+    if os.geteuid() != 0:
-+        return (os.EX_NOPERM, "Need sudo (CAP_SYS_ADMIN) for BPF() system call")
-+    c = _embedded_c(Global.args)
-+    try:
-+        return (os.EX_OK, BPF(text=c))
-+    except Exception as e:
-+        error = format(e)
-+        return (os.EX_SOFTWARE, "BPF error: " + error)
-+
-+def snoop(bpf, event_handler):
-+    """Call event_handler when release a lock that is held for a long time
-+    bpf - result returned by successful initialize()
-+    event_handler - callback function to handle termination event
-+    args.pid - Return after event_handler is called, only monitoring this pid
-+    """
-+    bpf["results"].open_perf_buffer(event_handler)
-+    while True:
-+        bpf.perf_buffer_poll()
-+        if Global.args.pid:
-+            if not os.path.exists('/proc/%d' % Global.args.pid):
-+                # the target process has exited
-+                break
-+
-+# =============================
-+# Script: invoked as a script
-+# =============================
-+def main():
-+    try:
-+        rc, buffer = initialize()
-+        if rc != os.EX_OK:
-+            print(buffer)
-+            sys.exit(-1)
-+        Global.bpf = buffer
-+        Global.bpf.attach_kretprobe(event="mutex_lock", fn_name="kprobe_ret_mutex_lock")
-+        Global.bpf.attach_kprobe(event="mutex_unlock", fn_name="kprobe_mutex_unlock")
-+        _print_header()
-+        snoop(buffer, _print_event)
-+    except KeyboardInterrupt:
-+        print()
-+        sys.exit()
-+
-+    return 0
-+
-+if __name__ == '__main__':
-+    main()
-diff --git a/tools/alimutexsnoop_example.txt b/tools/alimutexsnoop_example.txt
-new file mode 100644
-index 0000000..67ca03e
---- /dev/null
-+++ b/tools/alimutexsnoop_example.txt
-@@ -0,0 +1,79 @@
-+Demonstrations of alimutexsnoop.
-+
-+This profile tool monitors the mutex that is held by the process longer than
-+a specified time threshold, such as 100us. Once alimutexsnoop finds such mutexes,
-+it prints the pid of the process that held the mutex and the address of the
-+mutex. In addition, alimutexsnoop also prints the user and kernel stack of the
-+process.
-+
-+
-+Usage:
-+alimutexsnoop.py [-h] [-t THRESHOLD] [-p PID]
-+
-+Trace all mutexes that have been held for a long time
-+
-+optional arguments:
-+  -h, --help            show this help message and exit
-+  -t THRESHOLD, --threshold THRESHOLD
-+                        The mutexes held longer than this time threshold will
-+                        be traced(us)
-+  -p PID, --pid PID     trace this PID only
-+
-+examples:
-+    alimutexsnoop           # trace all mutexes that are held for a long time
-+    alimutexsnoop -t 50     # set the time threshold (us), mutexes are held longer
-+                              than this time threshold will be printed
-+    alimutexsnoop -p 123    # only trace the specified process
-+
-+
-+Output example:
-+
-+python alimutexsnoop.py -t 20 # monitor the mutexes that are held by processes longer than 20 us
-+
-+PCOMM            PID     MUTEX_ADDR
-+
-+AliYunDun        4283    ffffa30fd1b4ea48
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa52316fc  ksys_read
-+Kernel Stack:    ffffffffa57cfd21  mutex_unlock
-+USER Stack:      7f1532c4e6fd      read
-+=================================
-+
-+AliYunDun        4283    ffffa310d7e43c40
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa52316ba  ksys_read
-+Kernel Stack:    ffffffffa5231149  vfs_read
-+Kernel Stack:    ffffffffa5230f86  __vfs_read
-+Kernel Stack:    ffffffffa5254b30  seq_read
-+Kernel Stack:    ffffffffa57cfd21  mutex_unlock
-+USER Stack:      7f1532c4e6fd      read
-+=================================
-+
-+AliYunDun        4283    ffffa30fd1b4e348
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa52316fc  ksys_read
-+Kernel Stack:    ffffffffa57cfd21  mutex_unlock
-+USER Stack:      7f1532c4e6fd      read
-+=================================
-+
-+AliYunDunUpdate  4256    ffffa310d1146a40
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa52316ba  ksys_read
-+Kernel Stack:    ffffffffa5231149  vfs_read
-+Kernel Stack:    ffffffffa5230f86  __vfs_read
-+Kernel Stack:    ffffffffa5254b30  seq_read
-+Kernel Stack:    ffffffffa57cfd21  mutex_unlock
-+USER Stack:      7fbce69966fd      read
-+=================================
-+
-+AliYunDunUpdate  4256    ffffa310d3b48a48
-+Kernel Stack:    ffffffffa5800088  entry_SYSCALL_64_after_hwframe
-+Kernel Stack:    ffffffffa500201b  do_syscall_64
-+Kernel Stack:    ffffffffa52316fc  ksys_read
-+Kernel Stack:    ffffffffa57cfd21  mutex_unlock
-+USER Stack:      7fbce69966fd      read
-+=================================
-diff --git a/tools/alirunqinfo.py b/tools/alirunqinfo.py
-new file mode 100755
-index 0000000..102c763
---- /dev/null
-+++ b/tools/alirunqinfo.py
-@@ -0,0 +1,360 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# alirunqinfo  Summarize task on-CPU time before being descheduled.
-+#              For Linux, uses BCC, eBPF.
-+#
-+# USAGE: alirunqinfo [-h] [-T] [-N] [-D] [-C] [-P] [-c CPU] [-p PID]
-+#                    [interval] [outputs]
-+#
-+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+
-+from __future__ import print_function
-+from bcc import BPF
-+from time import sleep, strftime
-+import argparse
-+import ctypes
-+import signal
-+
-+# arguments
-+examples = """examples:
-+    ./alirunqinfo            # sum runqueue statistics
-+    ./alirunqinfo -C         # show per-CPU runqueue statistics
-+    ./alirunqinfo -P         # show per-thread runqueue statistics
-+    ./alirunqinfo -D         # show runqueue statistics as histograms
-+    ./alirunqinfo -c 0       # show runqueue statistics of CPU 0 only
-+    ./alirunqinfo -p 25      # show runqueue statistics of PID 25 only
-+    ./alirunqinfo 1 10       # print 1 second summaries, 10 times
-+    ./alirunqinfo -NT 1      # 1s summaries, nanoseconds, and timestamps
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Summarize runqueue statistics.",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-T", "--timestamp", action="store_true",
-+    help="include timestamp on output")
-+parser.add_argument("-N", "--nanoseconds", action="store_true",
-+    help="output in nanoseconds")
-+parser.add_argument("-C", "--cpus", action="store_true",
-+    help="show per-CPU runqueue statistics")
-+parser.add_argument("-P", "--pids", action="store_true",
-+    help="show per-thread runqueue statistics")
-+parser.add_argument("-D", "--dist", action="store_true",
-+    help="show distributions as histograms")
-+parser.add_argument("-c", "--cpu",
-+    help="show runqueue statistics on specific CPU only")
-+parser.add_argument("-p", "--pid",
-+    help="show runqueue statistics of specific PID only")
-+parser.add_argument("interval", nargs="?", default=99999999,
-+    help="output interval, in seconds")
-+parser.add_argument("outputs", nargs="?", default=99999999,
-+    help="number of outputs")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+
-+args = parser.parse_args()
-+countdown = int(args.outputs)
-+if args.pid:
-+    args.pids = True
-+if args.cpu:
-+    args.cpus = True
-+if (not args.cpus) and (not args.pids):
-+    args.cpus = True
-+
-+if args.nanoseconds:
-+    factor = 1
-+    label = "nsecs"
-+else:
-+    factor = 1000
-+    label = "usecs"
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+
-+typedef struct time_stamp {
-+    u64 stamp;
-+    u32 pid;
-+} time_stamp_t;
-+
-+BPF_PERCPU_ARRAY(start, time_stamp_t, 1);
-+
-+STORAGE
-+
-+
-+static inline int sched_switch_exit(u32 cpu, u32 pid, u64 now, time_stamp_t *new) {
-+    PID_FILTER
-+
-+    u32 idx = 0;
-+    // fetch timestamp and calculate delta
-+    time_stamp_t *stampp = start.lookup(&idx);
-+    if (!stampp || stampp->pid != pid) {
-+        return 0;   // missed start
-+    }
-+    u64 delta = now - stampp->stamp;
-+    start.update(&idx, new);
-+
-+    UPDATE
-+
-+    return 0;
-+}
-+
-+static inline int sched_switch_enter(u32 pid, u64 now, time_stamp_t *new) {
-+    PID_FILTER
-+
-+    u32 idx = 0;
-+    new->stamp = now;
-+    new->pid = pid;
-+
-+    start.update(&idx, new);
-+    return 0;
-+}
-+
-+TRACEPOINT_PROBE(sched, sched_switch)
-+{
-+    u32 cpu = bpf_get_smp_processor_id();
-+    CPU_FILTER
-+
-+    u64 now = bpf_ktime_get_ns();
-+    u32 pid, idx = 0;
-+    time_stamp_t new = {};
-+
-+    pid = args->prev_pid;
-+    if (pid != 0) {
-+        sched_switch_exit(cpu, pid, now, &new);
-+    }
-+
-+    pid = args->next_pid;
-+    if (pid != 0) {
-+        sched_switch_enter(pid, now, &new);
-+    }
-+
-+    return 0;
-+}
-+"""
-+
-+
-+# replace STORAGE
-+map_key = """
-+typedef struct map_key {
-+    u32 cpu;
-+    u32 pid;
-+} map_key_t;
-+"""
-+
-+map_val = """
-+typedef struct map_val {
-+    u64 time;
-+    u64 count;
-+} map_val_t;
-+"""
-+
-+if args.dist:
-+    # diagram mode, we need BPF_HISTOGRAM
-+    if args.cpus and args.pids:
-+        storage = map_key + """
-+typedef struct map_key_slot {
-+    map_key_t key;
-+    u64 slot;
-+} map_key_slot_t;
-+"""
-+    else:
-+        storage = """
-+typedef struct map_key_slot {
-+    u64 key;
-+    u64 slot;
-+} map_key_slot_t;
-+"""
-+    storage += """BPF_HISTOGRAM(dist, map_key_slot_t);"""
-+
-+else:
-+    # normal mode, we need BPF_HASH
-+    storage = map_val
-+    if args.cpus and args.pids:
-+        storage += map_key + """BPF_HASH(res, map_key_t, map_val_t);"""
-+    else:
-+        storage += """BPF_HASH(res, u32, map_val_t);"""
-+
-+bpf_text = bpf_text.replace('STORAGE', storage)
-+
-+
-+# replace UPDATE
-+if args.dist:
-+    if args.cpus and args.pids:
-+        key_exp = "{.cpu = cpu, .pid = pid,}"
-+    elif args.cpus:
-+        key_exp = "cpu"
-+    elif args.pids:
-+        key_exp = "pid"
-+
-+    update = """
-+    map_key_slot_t key_slot = {
-+        .key = %s,
-+        .slot = bpf_log2l(delta / FACTOR),
-+    };
-+    dist.increment(key_slot);
-+""" % key_exp
-+
-+else:
-+    if args.cpus and args.pids:
-+        update = """
-+    map_key_t key = {
-+        .cpu = cpu,
-+        .pid = pid,
-+    };
-+"""
-+    elif args.cpus:
-+        update = "u32 key = cpu;"
-+    elif args.pids:
-+        update = "u32 key = pid;"
-+
-+    update += """
-+    map_val_t *valp = res.lookup(&key);
-+    if (valp) {
-+        valp->count += 1;
-+        valp->time += delta;
-+    }
-+    else {
-+        map_val_t val = {
-+            .count = 1,
-+            .time  = delta,
-+        };
-+        res.update(&key, &val);
-+    }
-+"""
-+bpf_text = bpf_text.replace('UPDATE', update)
-+bpf_text = bpf_text.replace('FACTOR', str(factor))
-+
-+
-+# replace FILTER
-+cpu_filter = "if (cpu != %s) {return 0;}" % args.cpu if args.cpu else ""
-+bpf_text = bpf_text.replace('CPU_FILTER', cpu_filter)
-+
-+pid_filter = "if (pid != %s) {return 0;}" % args.pid if args.pid else ""
-+bpf_text = bpf_text.replace('PID_FILTER', pid_filter)
-+
-+
-+# output eBPF program C code after it is replaced, used by debugging
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+
-+# load BPF program
-+b = BPF(text=bpf_text)
-+
-+# print header
-+print("Tracing runqueue statistics", end="")
-+if args.pid:
-+    print(" of PID %s" % args.pid, end="")
-+if args.cpu:
-+    print(" on CPU %s" % args.cpu, end="")
-+if args.dist:
-+    print(", in diagram mode", end="")
-+print("...Hit Ctrl-C to end.")
-+
-+if args.dist:
-+    tab = b.get_table("dist")
-+else:
-+    tab = b.get_table("res")
-+
-+
-+def pid_to_comm(pid):
-+    try:
-+        return open("/proc/%d/comm" % pid, "r").read().rstrip("\n")
-+    except IOError:
-+        return ""
-+
-+
-+def print_per_cpu_thread(tab):
-+    cur_cpu = -1
-+    # k is struct map_key; v is struct map_val
-+    for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].pid)):
-+        if cur_cpu != k.cpu:
-+            print("\nCPU%d:" % k.cpu)
-+            print("%10s %40s %10s %10s" % ("PID", "Thread", "Count", "TOTAL_" + label))
-+            cur_cpu = k.cpu
-+        print("%10d %40s %10d %10d" % (k.pid, pid_to_comm(k.pid), v.count, v.time / factor))
-+
-+
-+def print_per_cpu(tab):
-+    print("%10s %10s %10s" % ("CPU", "Count", "TOTAL_" + label))
-+    # k is c_uint (representing CPU), v is struct map_val
-+    for k, v in sorted(tab.items(), key=lambda res: res[0].value):
-+        print("%10d %10d %10d" % (k.value, v.count, v.time / factor))
-+
-+
-+def print_per_thread(tab):
-+    print("%10s %40s %10s %10s" % ("PID", "Thread", "Count", "TOTAL_" + label))
-+    # k is c_uint (representing PID), v is struct map_val
-+    for k, v in sorted(tab.items(), key=lambda res: res[0].value):
-+        print("%10d %40s %10d %10d" % (k.value, pid_to_comm(k.value), v.count, v.time / factor))
-+
-+
-+def print_dist_per_cpu_thread(tab):
-+    tab.print_log2_hist("time_" + label, "section",
-+            # k is (cpuN, pidN) formatted by bucket_fn
-+            section_print_fn=lambda k: ("CPU %d, PID %d (%s)" % (k[0], k[1], pid_to_comm(k[1]))),
-+            bucket_fn=lambda k: (k.cpu, k.pid),  # k is struct map_key
-+            bucket_sort_fn=lambda k: sorted(k))  # k is struct map_key
-+
-+
-+def print_dist_per_cpu(tab):
-+    tab.print_log2_hist("time_" + label, "CPU",
-+            section_print_fn=lambda k: str(k),   # k is u64 representing CPU
-+            bucket_sort_fn=lambda k: sorted(k))  # k is u64 representing CPU
-+
-+
-+def print_dist_per_thread(tab):
-+    tab.print_log2_hist("time_" + label, "Thread",
-+            section_print_fn=lambda k: ("%d (%s)" % (k, pid_to_comm(k))),  # k is u64 representing PID
-+            bucket_sort_fn=lambda k: sorted(k))  # k is u64 representing PID
-+
-+
-+# signal handler
-+def signal_ignore(signal, frame):
-+    print()
-+
-+
-+# output
-+while (1):
-+    try:
-+        sleep(int(args.interval))
-+    except KeyboardInterrupt:
-+        # as cleanup can take many seconds, trap Ctrl-C:
-+        signal.signal(signal.SIGINT, signal_ignore)
-+        countdown = 1
-+
-+    print()
-+    if args.timestamp:
-+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
-+
-+    if args.dist:
-+        if args.cpus and args.pids:
-+            print_dist_per_cpu_thread(tab)
-+        elif args.cpus:
-+            print_dist_per_cpu(tab)
-+        elif args.pids:
-+            print_dist_per_thread(tab)
-+        else:
-+            print("Invalid parameter")
-+            countdown = 1
-+    else:
-+        if args.cpus and args.pids:
-+            print_per_cpu_thread(tab)
-+        elif args.cpus:
-+            print_per_cpu(tab)
-+        elif args.pids:
-+            print_per_thread(tab)
-+        else:
-+            print("Invalid parameter")
-+            countdown = 1
-+
-+    tab.clear()
-+
-+    countdown -= 1
-+    if countdown == 0:
-+        exit()
-diff --git a/tools/alirunqinfo_example.txt b/tools/alirunqinfo_example.txt
-new file mode 100644
-index 0000000..a1c24ca
---- /dev/null
-+++ b/tools/alirunqinfo_example.txt
-@@ -0,0 +1,203 @@
-+This program summarize runqueue statistics, that is, task on-CPU time
-+before being descheduled, in flavor of per-CPU, per-PID, or histogram.
-+
-+There are major two categories of parameters:
-+1. one category of parameters control the sampling source, such as
-+-c[--cpu] and -p[--pid]
-+2. another category of parameters control the displaying format,
-+such as -C[--cpus], -P[--pids] and -D[--dist]
-+
-+These two categories of parameters are Orthogonal.
-+
-+
-+Example 1.
-+
-+By default, the program will sample all processes on all runqueues
-+of all CPUs. Statistics of all processes on the same CPU is
-+aggregated and displayed in a per-CPU flavor.
-+
-+In this case, "alirunqinfo" is the same as "alirunqinfo -C".
-+
-+# alirunqinfo
-+Tracing runqueue statistics... Hit Ctrl-C to end.
-+^C
-+       CPU      Count TOTAL_usecs
-+         0         77        207
-+         1          9         27
-+         2          3         14
-+         3          2         15
-+         4         45        122
-+         6         41        380
-+         7          6         25
-+         9         18         61
-+        10         19         50
-+        12          2         19
-+        14          2        189
-+        16          1         13
-+        17         18         49
-+        20         36         73
-+        21         18         35
-+        22         22        109
-+        24         25         92
-+        28         21         68
-+        30          4         95
-+
-+
-+Example 2.
-+
-+Make the output displayed in a per-process flavor with "-P" parameter.
-+In this case, all processes on all CPUs are sampled as usual, while
-+all statistics of the same process is aggregated, whichever CPU it
-+runs on.
-+
-+# alirunqinfo -P
-+Tracing runqueue statistics... Hit Ctrl-C to end.
-+^C
-+       PID                                   Thread      Count TOTAL_usecs
-+        10                                rcu_sched        122        264
-+       382                kworker/14:1-mm_percpu_wq          1         20
-+      2926                                     rngd          1          2
-+      3327                kworker/16:2-mm_percpu_wq          1         15
-+      3384                             in:imjournal          2          8
-+      3428                          AliYunDunUpdate         20         60
-+      3429                          AliYunDunUpdate         20         40
-+      3442                           aliyun-service         20         98
-+      3449                                AliYunDun         20        137
-+      3450                                AliYunDun         19         58
-+      3451                                AliYunDun         20         56
-+      3462                                AliYunDun         19         46
-+      3463                                AliYunDun         19        100
-+      3464                                AliYunDun          2        220
-+      3467                                AliYunDun          2          6
-+      3468                                AliYunDun         26         61
-+      3469                                AliYunDun         39         78
-+      3470                                AliYunDun         39        125
-+      3471                                AliYunDun          2         10
-+      3473                                AliYunDun          2          4
-+      3474                                AliYunDun          1          4
-+      3475                                AliYunDun          2         11
-+      3476                                AliYunDun         10         30
-+      3533                                     sshd          5        193
-+      4573                kworker/27:2-mm_percpu_wq          2         13
-+      5016                kworker/24:0-mm_percpu_wq          1          5
-+      5281       kworker/4:0-events_power_efficient          9         19
-+      5999       kworker/0:2-events_power_efficient          2         34
-+      6429             kworker/u64:0-events_unbound          3         14
-+      6468             kworker/u64:1-events_unbound          2         37
-+
-+
-+Example 3.
-+
-+You can set "-P" and "-C" parameter at the same time, in which case, the
-+output is displayed in a per-CPU and per-process flavor.
-+
-+# alirunqinfo -PC
-+Tracing runqueue statistics... Hit Ctrl-C to end.
-+^C
-+
-+CPU0:
-+       PID                                   Thread      Count TOTAL_usecs
-+        10                                rcu_sched         72        147
-+      3467                                AliYunDun          1          6
-+
-+CPU1:
-+       PID                                   Thread      Count TOTAL_usecs
-+      3476                                AliYunDun          5         26
-+
-+CPU2:
-+       PID                                   Thread      Count TOTAL_usecs
-+      3384                             in:imjournal          2         12
-+
-+CPU3:
-+       PID                                   Thread      Count TOTAL_usecs
-+      2926                                     rngd          2          7
-+
-+
-+Example 4.
-+
-+Sample specified process only with "-p", and "-P" is added implicitly
-+when "-p" is specified.
-+
-+# alirunqinfo -p 3429
-+Tracing runqueue statistics... Hit Ctrl-C to end.
-+^C
-+       PID                                   Thread      Count TOTAL_usecs
-+      3429                          AliYunDunUpdate         14         40
-+
-+
-+Example 5.
-+
-+When both "-p" and "-C" parameters are specified, the output is displayed
-+in per-CPU and per-process flavor.
-+
-+# alirunqinfo -p 3429 -C
-+Tracing runqueue statistics... Hit Ctrl-C to end.
-+^C
-+
-+CPU22:
-+       PID                                   Thread      Count TOTAL_usecs
-+      3429                          AliYunDunUpdate         29         67
-+
-+
-+Example 6.
-+
-+Sample processes on specified CPU only with "-c", and "-C" is added
-+implicitly when "-c" is specified.
-+
-+# alirunqinfo -c 22
-+Tracing runqueue statistics... Hit Ctrl-C to end.
-+^C
-+       CPU      Count TOTAL_usecs
-+        22         24         90
-+
-+
-+Example 7.
-+
-+Make the output displayed in a diagram when "-D" is specified.
-+
-+# alirunqinfo -p 3429 -D
-+Tracing runqueue statistics... Hit Ctrl-C to end.
-+^C
-+
-+Thread = 3429 (AliYunDunUpdate)
-+     time_usecs          : count     distribution
-+         0 -> 1          : 22       |****************************************|
-+         2 -> 3          : 12       |*********************                   |
-+         4 -> 7          : 0        |                                        |
-+         8 -> 15         : 0        |                                        |
-+        16 -> 31         : 0        |                                        |
-+        32 -> 63         : 1        |*                                       |
-+
-+
-+
-+USAGE message:
-+
-+# alirunqinfo -h
-+usage: alirunqinfo.py [-h] [-T] [-N] [-C] [-P] [-D] [-c CPU] [-p PID]
-+                   [interval] [outputs]
-+
-+Summarize runqueue statistics, including total count, time, etc.
-+
-+positional arguments:
-+  interval           output interval, in seconds
-+  outputs            number of outputs
-+
-+optional arguments:
-+  -h, --help         show this help message and exit
-+  -T, --timestamp    include timestamp on output
-+  -N, --nanoseconds  output in nanoseconds
-+  -C, --cpus         show per-CPU runqueue statistics
-+  -P, --pids         show per-thread runqueue statistics
-+  -D, --dist         show distributions as histograms
-+  -c CPU, --cpu CPU  show runqueue statistics on specific CPU only
-+  -p PID, --pid PID  show runqueue statistics of specific PID only
-+
-+examples:
-+    ./alirunqinfo            # sum runqueue statistics
-+    ./alirunqinfo -C         # show per-CPU runqueue statistics
-+    ./alirunqinfo -P         # show per-thread runqueue statistics
-+    ./alirunqinfo -D         # show runqueue statistics as histograms
-+    ./alirunqinfo -c 0       # show runqueue statistics of CPU 0 only
-+    ./alirunqinfo -p 25      # show runqueue statistics of PID 25 only
-+    ./alirunqinfo 1 10       # print 1 second summaries, 10 times
-+    ./alirunqinfo -NT 1      # 1s summaries, nanoseconds, and timestamps
-diff --git a/tools/alisoftirqs.py b/tools/alisoftirqs.py
-new file mode 100755
-index 0000000..202114f
---- /dev/null
-+++ b/tools/alisoftirqs.py
-@@ -0,0 +1,215 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# alisoftirqs   Summarize soft IRQ (interrupt) event time.
-+#               Enhanced version of BCC softirqs.
-+#               For Linux, uses BCC, eBPF.
-+#
-+# USAGE: alisoftirqs [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] [interval] [outputs]
-+#
-+# Copyright (c) 2015 Brendan Gregg.
-+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+
-+from __future__ import print_function
-+from bcc import BPF
-+from time import sleep, strftime
-+import argparse
-+
-+# arguments
-+examples = """examples:
-+    ./alisoftirqs            # sum soft irq event time
-+    ./alisoftirqs -D         # show soft irq event time as histograms
-+    ./alisoftirqs -c 0       # show soft irq event time of CPU 0 only
-+    ./alisoftirqs -i 25      # show event time of IRQ 25 only
-+    ./alisoftirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only
-+    ./alisoftirqs 1 10       # print 1 second summaries, 10 times
-+    ./alisoftirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Summarize soft irq statistics, including total count, time, etc.",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-T", "--timestamp", action="store_true",
-+    help="include timestamp on output")
-+parser.add_argument("-N", "--nanoseconds", action="store_true",
-+    help="output in nanoseconds")
-+parser.add_argument("-D", "--dist", action="store_true",
-+    help="show distributions as histograms")
-+parser.add_argument("-c", "--cpu",
-+    help="output softIRQ statistics on specific CPU only")
-+parser.add_argument("-i", "--irq",
-+    help="output statistics of specific softIRQ")
-+parser.add_argument("interval", nargs="?", default=99999999,
-+    help="output interval, in seconds")
-+parser.add_argument("outputs", nargs="?", default=99999999,
-+    help="number of outputs")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+
-+args = parser.parse_args()
-+countdown = int(args.outputs)
-+distmode = "1" if args.dist else "0"
-+if args.nanoseconds:
-+    factor = 1
-+    label = "nsecs"
-+else:
-+    factor = 1000
-+    label = "usecs"
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+
-+typedef struct irq_key {
-+    u32 cpu;
-+    u32 irq;
-+} irq_key_t;
-+
-+typedef struct irq_key_slot {
-+    irq_key_t key;
-+    u64 slot;
-+} irq_key_slot_t;
-+
-+typedef struct irq_val {
-+    u64 count;
-+    u64 time;
-+    u64 time_max;
-+} irq_val_t;
-+
-+BPF_PERCPU_ARRAY(start, u64, 1);
-+BPF_HASH(res, irq_key_t, irq_val_t);
-+BPF_HISTOGRAM(dist, irq_key_slot_t);
-+
-+
-+TRACEPOINT_PROBE(irq, softirq_entry)
-+{
-+    COND_FILTER
-+
-+    u32 idx = 0;
-+    u64 ts = bpf_ktime_get_ns();
-+    start.update(&idx, &ts);
-+
-+    return 0;
-+}
-+
-+TRACEPOINT_PROBE(irq, softirq_exit)
-+{
-+    u64 *tsp, delta;
-+    u32 cpu = bpf_get_smp_processor_id();
-+    u32 irq = args->vec;
-+
-+    u32 idx = 0;
-+    // fetch timestamp and calculate delta
-+    tsp = start.lookup(&idx);
-+    if (!tsp || *tsp == 0) {
-+        return 0;   // missed start
-+    }
-+
-+    delta = bpf_ktime_get_ns() - *tsp;
-+    irq_key_t key = {.cpu = cpu, .irq = irq};
-+
-+#if DIST_MODE
-+    irq_key_slot_t key_slot = {
-+        .key = key,
-+        .slot = bpf_log2l(delta / FACTOR),
-+    };
-+    dist.increment(key_slot);
-+#else
-+    struct irq_val *valp, val;
-+    valp = res.lookup(&key);
-+
-+    if (valp) {
-+        valp->count += 1;
-+        valp->time += delta;
-+        if (valp->time_max < delta) {valp->time_max = delta;}
-+    }
-+    else {
-+        val.count = 1;
-+        val.time = val.time_max = delta;
-+        res.update(&key, &val);
-+    }
-+#endif
-+    u64 zero = 0;
-+    start.update(&idx, &zero);
-+
-+    return 0;
-+}
-+"""
-+
-+cond_filter = ''
-+cpu_filter = "1"
-+if args.cpu:
-+    cpu_filter = "bpf_get_smp_processor_id() == %s" % args.cpu
-+
-+irq_filter = "1"
-+if args.irq:
-+    irq_filter = "args->vec == %s" % args.irq
-+
-+cond_filter = "if (!(%s && %s)) {return 0;}" % (cpu_filter, irq_filter)
-+bpf_text = bpf_text.replace('COND_FILTER', cond_filter)
-+
-+bpf_text = bpf_text.replace('FACTOR', "%d" % factor)
-+bpf_text = bpf_text.replace('DIST_MODE', distmode)
-+
-+
-+# output eBPF program C code after it is replaced, used by debugging
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+
-+def vec_to_name(vec):
-+    # copied from softirq_to_name() in kernel/softirq.c
-+    # may need updates if new softirq handlers are added
-+    return ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
-+            "tasklet", "sched", "hrtimer", "rcu"][vec]
-+
-+def print_section(key):
-+    return "CPU %d, softIRQ %d (%s)" % (key[0], key[1], vec_to_name(key[1]))
-+
-+
-+# load BPF program
-+b = BPF(text=bpf_text)
-+print("Tracing soft irq event time... Hit Ctrl-C to end.")
-+
-+if args.dist:
-+    tab = b.get_table("dist")
-+else:
-+    tab = b.get_table("res")
-+
-+
-+# output
-+while (1):
-+    try:
-+        sleep(int(args.interval))
-+    except KeyboardInterrupt:
-+        countdown = 1
-+    print()
-+    if args.timestamp:
-+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
-+
-+    if args.dist:
-+        tab.print_log2_hist("time_" + label, "section",
-+            section_print_fn=print_section,
-+            bucket_fn=lambda k: (k.cpu, k.irq),
-+            bucket_sort_fn=lambda keys: sorted(keys))
-+    else:
-+        cur_cpu = -1
-+        for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].irq)):
-+            if cur_cpu != k.cpu:
-+                print("\nCPU%d:" % k.cpu)
-+                print("%10s %20s %10s %10s %10s" %
-+                    ("SOFTIRQ", "IRQHandler", "Count", "TOTAL_" + label, "MAX_" + label))
-+                cur_cpu = k.cpu
-+
-+            print("%10d %20s %10d %10d %10d" %
-+                (k.irq, vec_to_name(k.irq), v.count, v.time / factor, v.time_max / factor))
-+
-+        tab.clear()
-+
-+    countdown -= 1
-+    if countdown == 0:
-+        exit()
-diff --git a/tools/alisoftirqs_example.txt b/tools/alisoftirqs_example.txt
-new file mode 100644
-index 0000000..48e0023
---- /dev/null
-+++ b/tools/alisoftirqs_example.txt
-@@ -0,0 +1,128 @@
-+Demonstrations of alisoftirqs, the enhanced version of BCC softirqs.
-+
-+This program traces soft interrupts (irqs), and stores timing statistics
-+in-kernel for efficiency.
-+
-+
-+Example 1
-+
-+By default, all statistics of all softirq on all CPUs are sampled,
-+and the output is displayed in a per-softirq and per-CPU flavor.
-+
-+# alisoftirqs
-+Tracing soft irq event time... Hit Ctrl-C to end.
-+^C
-+
-+CPU0:
-+   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+         7                sched          1         59         59
-+
-+CPU3:
-+   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+         1                timer          1          0          0
-+         3               net_rx          6         29          9
-+         7                sched          1          1          1
-+         9                  rcu          1          0          0
-+
-+CPU4:
-+   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+         1                timer         29         11          1
-+         7                sched         11         26          4
-+         9                  rcu         19         22          5
-+
-+
-+Example 2
-+
-+You can sample statistics on specified CPU only with "-c" option,
-+and the output will be displayed in a per-softirq flavor.
-+
-+# alisoftirqs -c 6
-+Tracing soft irq event time... Hit Ctrl-C to end.
-+^C
-+
-+CPU6:
-+   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+         1                timer          7          2          0
-+         7                sched          2          9          8
-+         9                  rcu          5         10          5
-+
-+
-+Exampel 3
-+
-+You can sample statistics of specified softirq only with "-i" option,
-+and the output will be displayed in a per-CPU flavor.
-+
-+# alisoftirqs -i 1
-+Tracing soft irq event time... Hit Ctrl-C to end.
-+^C
-+
-+CPU0:
-+   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+         1                timer          1          2          2
-+
-+CPU4:
-+   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+         1                timer         26         11          2
-+
-+CPU7:
-+   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
-+         1                timer          2          5          4
-+
-+
-+
-+Example 4
-+
-+The distribution of softirq run time can be printed as a histogram with "-D"
-+option. The output will be displayed in a per-softirq and per-CPU flavor.
-+
-+# alisoftirqs -D
-+Tracing soft irq event time... Hit Ctrl-C to end.
-+^C
-+
-+section = CPU 3, softIRQ 3 (net_rx)
-+     time_usecs          : count     distribution
-+         0 -> 1          : 0        |                                        |
-+         2 -> 3          : 2        |****************************************|
-+         4 -> 7          : 0        |                                        |
-+         8 -> 15         : 1        |********************                    |
-+
-+section = CPU 4, softIRQ 1 (timer)
-+     time_usecs          : count     distribution
-+         0 -> 1          : 13       |****************************************|
-+
-+section = CPU 4, softIRQ 7 (sched)
-+     time_usecs          : count     distribution
-+         0 -> 1          : 2        |****************************************|
-+         2 -> 3          : 1        |********************                    |
-+         4 -> 7          : 2        |****************************************|
-+
-+
-+
-+USAGE message:
-+
-+# alisoftirqs -h
-+usage: alisoftirqs.py [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ]
-+                       [interval] [outputs]
-+
-+Summarize soft irq statistics, including total count, time, etc.
-+
-+positional arguments:
-+  interval           output interval, in seconds
-+  outputs            number of outputs
-+
-+optional arguments:
-+  -h, --help         show this help message and exit
-+  -T, --timestamp    include timestamp on output
-+  -N, --nanoseconds  output in nanoseconds
-+  -D, --dist         show distributions as histograms
-+  -c CPU, --cpu CPU  output softIRQ statistics on specific CPU only
-+  -i IRQ, --irq IRQ  output statistics of specific softIRQ
-+
-+examples:
-+    ./alisoftirqs            # sum soft irq event time
-+    ./alisoftirqs -D         # show soft irq event time as histograms
-+    ./alisoftirqs -c 0       # show soft irq event time of CPU 0 only
-+    ./alisoftirqs -i 25      # show event time of IRQ 25 only
-+    ./alisoftirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only
-+    ./alisoftirqs 1 10       # print 1 second summaries, 10 times
-+    ./alisoftirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
-diff --git a/tools/alisyscount.py b/tools/alisyscount.py
-new file mode 100755
-index 0000000..921fac7
---- /dev/null
-+++ b/tools/alisyscount.py
-@@ -0,0 +1,329 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# alisyscount   Summarize syscall counts and latencies.
-+#               Enhanced version of BCC syscount.
-+#               For Linux, uses BCC, eBPF.
-+#
-+# USAGE: alisyscount [-h] [-T] [-N] [-D] [-S] [-P] [-s syscall] [-p pid] [interval] [outputs]
-+#
-+# Copyright (c) 2015 Brendan Gregg.
-+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+
-+from __future__ import print_function
-+from bcc import BPF
-+from time import sleep, strftime
-+import argparse
-+import ctypes
-+from bcc.syscall import syscall_name, syscalls
-+
-+# arguments
-+examples = """examples:
-+    ./alisyscount            # sum syscall counts and latencies.
-+    ./alisyscount -S         # show per-syscall statistics
-+    ./alisyscount -P         # show per-thread syscall statistics
-+    ./alisyscount -D         # show syscall statistics as histograms
-+    ./alisyscount -s 0       # show syscall statistics of syscall 0 only
-+    ./alisyscount -p 25      # show syscall statistics of PID 25 only
-+    ./alisyscount 1 10       # print 1 second summaries, 10 times
-+    ./alisyscount -NT 1      # 1s summaries, nanoseconds, and timestamps
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Summarize syscall counts and latencies.",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-T", "--timestamp", action="store_true",
-+    help="include timestamp on output")
-+parser.add_argument("-N", "--nanoseconds", action="store_true",
-+    help="output in nanoseconds")
-+parser.add_argument("-S", "--syscalls", action="store_true",
-+    help="show per-syscall statistics")
-+parser.add_argument("-P", "--pids", action="store_true",
-+    help="show per-thread syscall statistics")
-+parser.add_argument("-D", "--dist", action="store_true",
-+    help="show distributions as histograms")
-+parser.add_argument("-s", "--syscall",
-+    help="show statistics of specific syscall only")
-+parser.add_argument("-p", "--pid",
-+    help="show syscall statistics of specific PID only")
-+parser.add_argument("interval", nargs="?", default=99999999,
-+    help="output interval, in seconds")
-+parser.add_argument("outputs", nargs="?", default=99999999,
-+    help="number of outputs")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+
-+args = parser.parse_args()
-+countdown = int(args.outputs)
-+if args.pid:
-+    args.pids = True
-+if args.syscall:
-+    args.syscalls = True
-+if (not args.syscalls) and (not args.pids):
-+    args.syscalls = True
-+
-+if args.nanoseconds:
-+    factor = 1
-+    label = "nsecs"
-+else:
-+    factor = 1000
-+    label = "usecs"
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+
-+BPF_PERCPU_ARRAY(start, u64, 1);
-+
-+STORAGE
-+
-+TRACEPOINT_PROBE(raw_syscalls, sys_enter)
-+{
-+    COND_FILTER
-+
-+    u32 idx = 0;
-+    u64 ts = bpf_ktime_get_ns();
-+    start.update(&idx, &ts);
-+
-+    return 0;
-+}
-+
-+TRACEPOINT_PROBE(raw_syscalls, sys_exit)
-+{
-+    COND_FILTER
-+
-+    u32 idx = 0;
-+    u64 *tsp, delta;
-+    tsp = start.lookup(&idx);
-+    if (!tsp || *tsp == 0) {
-+        return 0;   // missed start
-+    }
-+
-+    delta = bpf_ktime_get_ns() - *tsp;
-+    u64 zero = 0;
-+    start.update(&idx, &zero);
-+
-+    u32 sysnr = args->id;
-+    u32 pid = bpf_get_current_pid_tgid();
-+
-+    UPDATE
-+
-+    return 0;
-+}
-+"""
-+
-+
-+# replace STORAGE
-+map_key = """
-+typedef struct map_key {
-+    u32 sysnr;
-+    u32 pid;
-+} map_key_t;
-+"""
-+
-+map_val = """
-+typedef struct map_val {
-+    u64 time;
-+    u64 count;
-+} map_val_t;
-+"""
-+
-+if args.dist:
-+    if args.syscalls and args.pids:
-+        storage = map_key + """
-+typedef struct map_key_slot {
-+    map_key_t key;
-+    u64 slot;
-+} map_key_slot_t;
-+"""
-+    else:
-+        storage = """
-+typedef struct map_key_slot {
-+    u64 key;
-+    u64 slot;
-+} map_key_slot_t;
-+"""
-+    storage += "BPF_HISTOGRAM(dist, map_key_slot_t);"
-+
-+else:
-+    storage = map_val
-+    if args.syscalls and args.pids:
-+        storage += map_key + "BPF_HASH(res, map_key_t, map_val_t);"
-+    else:
-+        storage += "BPF_HASH(res, u32, map_val_t);"
-+bpf_text = bpf_text.replace('STORAGE', storage)
-+
-+
-+# replace UPDATE
-+if args.dist:
-+    if args.syscalls and args.pids:
-+        key_exp = "{.sysnr = sysnr, .pid = pid,}"
-+    elif args.syscalls:
-+        key_exp = "sysnr"
-+    elif args.pids:
-+        key_exp = "pid"
-+
-+    update = """
-+    map_key_slot_t key_slot = {
-+        .key = %s,
-+        .slot = bpf_log2l(delta / FACTOR),
-+    };
-+    dist.increment(key_slot);
-+""" % key_exp
-+
-+else:
-+    if args.syscalls and args.pids:
-+        update = """
-+    map_key_t key = {
-+        .sysnr = sysnr,
-+        .pid = pid,
-+    };
-+"""
-+    elif args.syscalls:
-+        update = "u32 key = sysnr;"
-+    elif args.pids:
-+        update = "u32 key = pid;"
-+
-+    update += """
-+    map_val_t val = {};
-+    map_val_t *valp = res.lookup_or_init(&key, &val);
-+    if (valp) {
-+        valp->count += 1;
-+        valp->time += delta;
-+    }
-+"""
-+bpf_text = bpf_text.replace('UPDATE', update)
-+bpf_text = bpf_text.replace('FACTOR', str(factor))
-+
-+
-+# replace FILTER
-+sysnr_match = "args->id == %s" % args.syscall
-+pid_match = "((u32)bpf_get_current_pid_tgid()) == %s" % args.pid
-+
-+if args.syscall and args.pid:
-+    cond_match = "(%s) && (%s)" % (sysnr_match, pid_match)
-+elif args.syscall:
-+    cond_match = sysnr_match
-+elif args.pid:
-+    cond_match = pid_match
-+else:
-+    cond_match = ""
-+
-+cond_filter = "if (!(%s)) {return 0;}" % cond_match if cond_match else ""
-+bpf_text = bpf_text.replace('COND_FILTER', cond_filter)
-+
-+
-+# output eBPF program C code after it is replaced, used by debugging
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+
-+# load BPF program
-+b = BPF(text=bpf_text)
-+print("Tracing syscall statistics... Hit Ctrl-C to end.")
-+
-+if args.dist:
-+    tab = b.get_table("dist")
-+else:
-+    tab = b.get_table("res")
-+
-+
-+def pid_to_comm(pid):
-+    try:
-+        return open("/proc/%d/comm" % pid, "r").read().rstrip("\n")
-+    except IOError:
-+        return ""
-+
-+
-+def get_pid_info(pid):
-+    return "%10d %30s" % (pid, pid_to_comm(pid))
-+
-+def get_syscall_info(sysnr):
-+    return "%10d %30s" % (sysnr, syscall_name(sysnr))
-+
-+def skip_errata(k):
-+    if args.syscalls and args.pids:
-+        sysnr = k.sysnr
-+    else:  # args.syscalls or args.pids
-+        sysnr = k.value
-+
-+    # happens occasionally, we don't need it
-+    if sysnr == 0xFFFFFFFF:
-+        return True
-+    else:
-+        return False
-+
-+
-+def print_stat(tab):
-+    if args.syscalls and args.pids:
-+        section_hdr = "%10s %30s %10s %30s" % ("SysNR", "Syscall", "PID", "Thread")
-+    elif args.pids:
-+        section_hdr = "%10s %30s" % ("PID", "Thread")
-+    else:  # args.syscalls
-+        section_hdr = "%10s %30s" % ("SysNR", "Syscall")
-+
-+    section_hdr += " %10s %10s" % ("Count", "TOTAL_" + label)
-+    print(section_hdr)
-+
-+    # k is struct map_key; v is struct map_val
-+    for k, v in sorted(tab.items(), key=lambda res: (-res[1].count, -res[1].time)):
-+        if skip_errata(k):
-+            continue
-+
-+        if args.syscalls and args.pids:
-+            record = "%40s %40s" % (get_syscall_info(k.sysnr), get_pid_info(k.pid))
-+        elif args.pids:
-+            record = "%40s" % (get_pid_info(k.value))
-+        else:  # args.syscalls
-+            record = "%40s" % (get_syscall_info(k.value))
-+
-+        record += "%10d %10d" % (v.count, v.time / factor)
-+        print(record)
-+
-+
-+def get_dist_pid_info(pid):
-+    return "PID %d (%s)" % (pid, pid_to_comm(pid))
-+
-+def get_dist_syscall_info(sysnr):
-+    return "Syscall %d (%s)" % (sysnr, syscall_name(sysnr))
-+
-+def print_dist_section(k):
-+    if args.syscalls and args.pids:
-+        return "%s %s" % (get_dist_syscall_info(k[0]), get_dist_pid_info(k[1]))
-+    elif args.pids:
-+        return "%40s" % (get_dist_pid_info(k))
-+    else:
-+        return "%40s" % (get_dist_syscall_info(k))
-+
-+def print_dist_stat(tab):
-+    section = "section" if args.syscalls and args.pids else "Thread" if args.pids else "Syscall"
-+    tab.print_log2_hist("time_" + label, section,
-+        section_print_fn=print_dist_section,
-+        bucket_fn=(lambda k: (k.sysnr, k.pid)) if args.syscalls and args.pids else None,
-+        bucket_sort_fn=lambda k: sorted(k))
-+
-+
-+# output
-+while (1):
-+    try:
-+        sleep(int(args.interval))
-+    except KeyboardInterrupt:
-+        countdown = 1
-+    print()
-+    if args.timestamp:
-+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
-+
-+    if args.dist:
-+        print_dist_stat(tab)
-+    else:
-+        print_stat(tab)
-+
-+    tab.clear()
-+
-+    countdown -= 1
-+    if countdown == 0:
-+        exit()
-diff --git a/tools/alisyscount_example.txt b/tools/alisyscount_example.txt
-new file mode 100644
-index 0000000..935f2ff
---- /dev/null
-+++ b/tools/alisyscount_example.txt
-@@ -0,0 +1,199 @@
-+This program summarize syscall counts and latencies, in flavor of
-+per-CPU, per-PID, or histogram.
-+
-+There are major two categories of parameters:
-+1. one category of parameters control the sampling source, such as
-+-s[--syscall] and -p[--pid]
-+2. another category of parameters control the displaying format,
-+such as -S[--syscalls], -P[--pids] and -D[--dist]
-+
-+These two categories of parameters are Orthogonal.
-+
-+
-+Example 1.
-+
-+By default, the program will sample all syscalls of all processes.
-+Statistics of all processes on the same syscall is aggregated and
-+displayed in a per-syscall flavor.
-+
-+In this case, "alisyscount" is the same as "alisyscount -S".
-+
-+# alisyscount
-+Tracing syscall statistics... Hit Ctrl-C to end.
-+^C
-+     SysNR                        Syscall      Count TOTAL_usecs
-+        35                      nanosleep       246   23815620
-+         4                           stat       207        306
-+         0                           read        61         65
-+       321                            bpf        43        112
-+         3                          close        28         23
-+        14                 rt_sigprocmask        20          5
-+       232                     epoll_wait        20          5
-+       162                           sync        19    1901216
-+       228                  clock_gettime        18          5
-+        23                         select         9    4016487
-+         6                          lstat         9         16
-+         1                          write         8        136
-+         2                           open         8         19
-+        72                          fcntl         6          1
-+         5                          fstat         4          2
-+        16                          ioctl         3          3
-+       271                          ppoll         1     900954
-+        78                       getdents         1        355
-+        89                       readlink         1          4
-+       257                         openat         1          2
-+         9                           mmap         1          2
-+       100                          times         1          2
-+        13                   rt_sigaction         1          0
-+
-+
-+Example 2.
-+
-+Make the output displayed in a per-process flavor with "-P" parameter.
-+In this case, all syscalls of all processe are sampled as usual, while
-+all statistics of the same process is aggregated, whatever syscall it
-+runs.
-+
-+# alisyscount -P
-+Tracing syscall statistics... Hit Ctrl-C to end.
-+^C
-+       PID                         Thread      Count TOTAL_usecs
-+      3550                      AliYunDun       125    3103841
-+      3529                      AliYunDun        71    3101981
-+      3549                      AliYunDun        62    3103861
-+      3543                      AliYunDun        61    3001874
-+      3493                 aliyun-service        61    3001874
-+      3564                           sshd        48    3149751
-+      3824                alisyscount.py        46    3149398
-+      3548                      AliYunDun        40    3002530
-+      3544                      AliYunDun        32    2000233
-+      3530                      AliYunDun        31    3101971
-+      3556                      AliYunDun        31    3001000
-+      3542                      AliYunDun        30    3001918
-+      3531                      AliYunDun        30    3001871
-+      3423                AliYunDunUpdate        30    3001867
-+      3424                AliYunDunUpdate        30    3001853
-+      3475                   in:imjournal         5    1801873
-+      3551                      AliYunDun         3    1000111
-+      3422                AliYunDunUpdate         3          1
-+      3553                      AliYunDun         2    2000172
-+      3547                      AliYunDun         2    2000172
-+      3555                      AliYunDun         2    2000162
-+
-+
-+Example 3.
-+
-+You can set "-P" and "-S" parameter at the same time, in which case, the
-+output is displayed in a per-syscall and per-process flavor.
-+
-+# alisyscount -P -S
-+Tracing syscall statistics... Hit Ctrl-C to end.
-+^C
-+     SysNR                        Syscall        PID                         Thread      Count TOTAL_usecs
-+         4                           stat       3550                      AliYunDun       586        855
-+        35                      nanosleep       3549                      AliYunDun        40    2002500
-+         0                           read       3550                      AliYunDun        36         90
-+        35                      nanosleep       3548                      AliYunDun        26    1951607
-+       321                            bpf       3832                alisyscount.py        23         36
-+        35                      nanosleep       3550                      AliYunDun        22    1901390
-+        35                      nanosleep       3530
-+
-+
-+Example 4.
-+
-+Sample specified process only with "-p", and "-P" is added implicitly
-+when "-p" is specified.
-+
-+# alisyscount -p 3550
-+Tracing syscall statistics... Hit Ctrl-C to end.
-+^C
-+       PID                         Thread      Count TOTAL_usecs
-+      3550                      AliYunDun       121    3003773
-+
-+
-+Example 5.
-+
-+When both "-p" and "-S" parameters are specified, the output is displayed
-+in per-syscall and per-process flavor.
-+
-+# alisyscount -p 3550 -S
-+Tracing syscall statistics... Hit Ctrl-C to end.
-+^C
-+     SysNR                        Syscall        PID                         Thread      Count TOTAL_usecs
-+         0                           read       3550                      AliYunDun        42         16
-+        35                      nanosleep       3550                      AliYunDun        41    2052562
-+
-+
-+Example 6.
-+
-+Sample specified syscall only with "-s", and "-S" is added
-+implicitly when "-s" is specified.
-+
-+# alisyscount -s 3
-+Tracing syscall statistics... Hit Ctrl-C to end.
-+^C
-+     SysNR                        Syscall      Count TOTAL_usecs
-+         3                          close        36         15
-+
-+
-+Example 7.
-+
-+Make the output displayed in a diagram when "-D" is specified.
-+
-+# alisyscount -p 3550 -D
-+Tracing syscall statistics... Hit Ctrl-C to end.
-+^C
-+
-+Thread =                     PID 3550 (AliYunDun)
-+     time_usecs          : count     distribution
-+         0 -> 1          : 33       |****************************************|
-+         2 -> 3          : 0        |                                        |
-+         4 -> 7          : 0        |                                        |
-+         8 -> 15         : 0        |                                        |
-+        16 -> 31         : 0        |                                        |
-+        32 -> 63         : 0        |                                        |
-+        64 -> 127        : 0        |                                        |
-+       128 -> 255        : 0        |                                        |
-+       256 -> 511        : 0        |                                        |
-+       512 -> 1023       : 0        |                                        |
-+      1024 -> 2047       : 0        |                                        |
-+      2048 -> 4095       : 0        |                                        |
-+      4096 -> 8191       : 0        |                                        |
-+      8192 -> 16383      : 0        |                                        |
-+     16384 -> 32767      : 0        |                                        |
-+     32768 -> 65535      : 32       |**************************************  |
-+
-+
-+
-+USAGE message:
-+
-+# alisyscount -h
-+usage: alisyscount.py [-h] [-T] [-N] [-S] [-P] [-D] [-s SYSCALL] [-p PID]
-+                      [interval] [outputs]
-+
-+Summarize syscall counts and latencies.
-+
-+positional arguments:
-+  interval              output interval, in seconds
-+  outputs               number of outputs
-+
-+optional arguments:
-+  -h, --help            show this help message and exit
-+  -T, --timestamp       include timestamp on output
-+  -N, --nanoseconds     output in nanoseconds
-+  -S, --syscalls        show per-syscall statistics
-+  -P, --pids            show per-thread syscall statistics
-+  -D, --dist            show distributions as histograms
-+  -s SYSCALL, --syscall SYSCALL
-+                        show statistics of specific syscall only
-+  -p PID, --pid PID     show syscall statistics of specific PID only
-+
-+examples:
-+    ./alisyscount            # sum syscall counts and latencies.
-+    ./alisyscount -S         # show per-syscall statistics
-+    ./alisyscount -P         # show per-thread syscall statistics
-+    ./alisyscount -D         # show syscall statistics as histograms
-+    ./alisyscount -s 0       # show syscall statistics of syscall 0 only
-+    ./alisyscount -p 25      # show syscall statistics of PID 25 only
-+    ./alisyscount 1 10       # print 1 second summaries, 10 times
-+    ./alisyscount -NT 1      # 1s summaries, nanoseconds, and timestamps
-diff --git a/tools/alisysdelay.py b/tools/alisysdelay.py
-new file mode 100755
-index 0000000..e4c7952
---- /dev/null
-+++ b/tools/alisysdelay.py
-@@ -0,0 +1,297 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# alisysdelay Catch tasks hung in kernel mode and get its stack
-+#             For Linux, uses BCC, BPF, perf_events. Embedded C.
-+#
-+# In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes
-+# may be unacceptable, because some tasks may run too long in kernel mode
-+# without calling cond_sched(). These tasks may be kernel thread, or user
-+# processes that enter kernel mode by syscall.
-+#
-+# alisysdelay can catch tasks that have been running continuously for a long time
-+# whithout scheduling, and kernel/user stackes of these tasks will be printed.
-+# The threshold can be user defined.
-+#
-+# By default CPU idle stacks are excluded by simply excluding PID 0.
-+#
-+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
-+#
-+# USAGE: alisysdelay [-h] [-d] [-a] [-f frequency] [-t threshold] [--stack-storage-size size]
-+#
-+# Copyright (c) 2016 Brendan Gregg, Netflix, Inc.
-+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+
-+from __future__ import print_function
-+from bcc import BPF, PerfType, PerfSWConfig
-+import argparse
-+import errno
-+import ctypes as ct
-+
-+
-+# arg validation
-+def positive_int(val):
-+    try:
-+        ival = int(val)
-+    except ValueError:
-+        raise argparse.ArgumentTypeError("must be an integer")
-+
-+    if ival < 0:
-+        raise argparse.ArgumentTypeError("must be positive")
-+    return ival
-+
-+def positive_nonzero_int(val):
-+    ival = positive_int(val)
-+    if ival == 0:
-+        raise argparse.ArgumentTypeError("must be nonzero")
-+    return ival
-+
-+def stack_id_err(stack_id):
-+    # -EFAULT in get_stackid normally means the stack-trace is not availible,
-+    # Such as getting kernel stack trace in userspace code
-+    return (stack_id < 0) and (stack_id != -errno.EFAULT)
-+
-+# arguments
-+examples = """examples:
-+    ./alisysdelay             # catch tasks hung in kernel mode
-+    ./alisysdelay -f 99       # catch tasks hung in kernel mode at 99 Hertz
-+    ./alisysdelay -t 100      # catch tasks that hung in kernel mode for more than 100us
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Catch tasks hung in kernel mode and get its stack",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-f", "--frequency", type=positive_nonzero_int, default=100,
-+    help="sample frequency in Hertz "
-+        "(default %(default)s)")
-+parser.add_argument("-d", "--delimited", action="store_true",
-+    help="insert delimiter between kernel/user stacks")
-+parser.add_argument("-a", "--annotations", action="store_true",
-+    help="add _[k] annotations to kernel frames")
-+parser.add_argument("-t", "--threshold", type=positive_nonzero_int, default=50,
-+    help="set the threshold of delay in kernel mode, in us "
-+        "(default %(default)s)")
-+parser.add_argument("--stack-storage-size", type=positive_nonzero_int, default=16384,
-+    help="the number of unique stack traces that can be stored and displayed "
-+        "(default %(default)s)")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+
-+# option logic
-+args = parser.parse_args()
-+debug = 0
-+
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+#include <uapi/linux/bpf_perf_event.h>
-+#include <asm/segment.h>
-+#include <asm/processor-flags.h>
-+#include <linux/sched.h>
-+
-+typedef struct event {
-+    u32 pid;
-+    u64 kernel_ip;
-+    int user_stack_id;
-+    int kernel_stack_id;
-+    char comm[TASK_COMM_LEN];
-+} event_t;
-+
-+BPF_PERCPU_ARRAY(start, u64, 1);
-+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
-+BPF_PERF_OUTPUT(events);
-+
-+static inline void update_start_time()
-+{
-+    u32 idx = 0;
-+    u64 ts = bpf_ktime_get_ns();
-+    start.update(&idx, &ts);
-+}
-+
-+TRACEPOINT_PROBE(raw_syscalls, sys_enter)
-+{
-+    update_start_time();
-+    return 0;
-+}
-+
-+TRACEPOINT_PROBE(sched, sched_switch)
-+{
-+    update_start_time();
-+    return 0;
-+}
-+
-+int kprobe___cond_resched(struct pt_regs *ctx)
-+{
-+    update_start_time();
-+    return 0;
-+}
-+
-+// This is copy from arch/x86/asm/ptrace.h
-+static inline int user_mode_dump(struct pt_regs *regs)
-+{
-+#ifdef CONFIG_X86_32
-+    u64 cs, flags;
-+    bpf_probe_read(&cs, sizeof(u64), &regs->cs);
-+    bpf_probe_read(&flags, sizeof(u64), &regs->flags);
-+    return ((cs & SEGMENT_RPL_MASK) | (flags & X86_VM_MASK)) >= USER_RPL;
-+#else
-+    u64 cs;
-+    bpf_probe_read(&cs, sizeof(u64), &regs->cs);
-+    return !!( cs & 3);
-+#endif
-+}
-+
-+static bool need_check_delay(struct pt_regs *regs)
-+{
-+    // ignore process 0
-+    if (bpf_get_current_pid_tgid() == 0) {
-+        return 0;
-+    }
-+
-+    // skip when currently cpu is in user mode
-+    if (user_mode_dump(regs)) {
-+        return 0;
-+    }
-+
-+    return 1;
-+}
-+
-+static inline void get_stack_id(struct bpf_perf_event_data *ctx)
-+{
-+    event_t key = {.pid = bpf_get_current_pid_tgid()};
-+    bpf_get_current_comm(&key.comm, sizeof(key.comm));
-+
-+    // get stacks
-+    key.user_stack_id = stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK);
-+    key.kernel_stack_id = stack_traces.get_stackid(&ctx->regs, 0);
-+
-+    if (key.kernel_stack_id >= 0) {
-+        // populate extras to fix the kernel stack
-+        u64 ip = PT_REGS_IP(&ctx->regs);
-+        u64 page_offset;
-+
-+        // if ip isn't sane, leave key ips as zero for later checking
-+#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
-+        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
-+        page_offset = __PAGE_OFFSET_BASE;
-+#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
-+        // x64, 4.17, and later
-+#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
-+        page_offset = __PAGE_OFFSET_BASE_L5;
-+#else
-+        page_offset = __PAGE_OFFSET_BASE_L4;
-+#endif
-+#else
-+        // earlier x86_64 kernels, e.g., 4.6, comes here
-+        // arm64, s390, powerpc, x86_32
-+        page_offset = PAGE_OFFSET;
-+#endif
-+
-+        if (ip > page_offset) {
-+            key.kernel_ip = ip;
-+        }
-+    }
-+
-+    events.perf_submit(ctx, &key, sizeof(event_t));
-+}
-+
-+int sys_timer(struct bpf_perf_event_data *ctx) {
-+
-+    if (need_check_delay(&ctx->regs)) {
-+        u32 idx = 0;
-+        u64 *tsp = start.lookup(&idx);
-+        if (!tsp || *tsp == 0) {
-+            return 0;   // missed start
-+        }
-+
-+        if ((bpf_ktime_get_ns() - *tsp) >= (DELAY_THRESHOLD)) {
-+            get_stack_id(ctx);
-+        }
-+    }
-+    else {
-+        update_start_time();
-+    }
-+
-+    return 0;
-+}
-+"""
-+
-+# set stack storage size
-+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
-+bpf_text = bpf_text.replace('DELAY_THRESHOLD', str(args.threshold * 1000))
-+
-+
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+# initialize BPF & perf_events
-+b = BPF(text=bpf_text)
-+b.attach_perf_event(ev_type=PerfType.SOFTWARE,
-+    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="sys_timer",
-+    sample_freq=args.frequency, pid=-1, cpu=-1, group_fd=-1)
-+
-+
-+def aksym(addr):
-+    if args.annotations:
-+        return b.ksym(addr) + "_[k]".encode()
-+    else:
-+        return b.ksym(addr)
-+
-+TASK_COMM_LEN = 16
-+class Record(ct.Structure):
-+    _fields_ = [("pid", ct.c_ulong),
-+                ("kernel_ip", ct.c_ulonglong),
-+                ("user_stack_id", ct.c_int),
-+                ("kernel_stack_id", ct.c_int),
-+                ("comm", ct.c_char * TASK_COMM_LEN)]
-+
-+
-+def print_stack(cpu, data, size):
-+    k = ct.cast(data, ct.POINTER(Record)).contents
-+
-+    user_stack = [] if k.user_stack_id < 0 else stack_traces.walk(k.user_stack_id)
-+    kernel_tmp = [] if k.kernel_stack_id < 0 else stack_traces.walk(k.kernel_stack_id)
-+
-+    # fix kernel stack
-+    kernel_stack = []
-+    if k.kernel_stack_id >= 0:
-+        for addr in kernel_tmp:
-+            kernel_stack.append(addr)
-+        # the later IP checking
-+        if k.kernel_ip:
-+            kernel_stack.insert(0, k.kernel_ip)
-+
-+    # print default multi-line stack output
-+    if stack_id_err(k.kernel_stack_id):
-+        print("    [Missed Kernel Stack]")
-+    else:
-+        for addr in kernel_stack:
-+            print("    %s" % aksym(addr))
-+    if args.delimited and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
-+        print("    --")
-+    if stack_id_err(k.user_stack_id):
-+        print("    [Missed User Stack]")
-+    else:
-+        for addr in user_stack:
-+            print("    %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))
-+    print("    %-16s %s (%d)" % ("-", k.comm.decode('utf-8', 'replace'), k.pid))
-+    print()
-+
-+
-+# header
-+print("Sampling at %s Hz frequency" % args.frequency, end="")
-+print("... Hit Ctrl-C to end.")
-+
-+# collect samples
-+b["events"].open_perf_buffer(print_stack)
-+stack_traces = b["stack_traces"]
-+
-+while (1):
-+    try:
-+        b.perf_buffer_poll()
-+    except KeyboardInterrupt:
-+        exit()
-diff --git a/tools/alisysdelay_example.txt b/tools/alisysdelay_example.txt
-new file mode 100644
-index 0000000..6f8f593
---- /dev/null
-+++ b/tools/alisysdelay_example.txt
-@@ -0,0 +1,75 @@
-+alisysdelay is used to catch tasks hung in kernel mode and get its stack.
-+
-+In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes
-+may be unacceptable, because some tasks may run too long in kernel mode
-+without calling cond_sched(). These tasks may be kernel thread, or user
-+processes that enter kernel mode by syscall.
-+
-+alisysdelay can catch tasks that have been running continuously for a long time
-+whithout scheduling, and kernel/user stackes will be printed for these tasks.
-+The threshold can be user defined.
-+
-+By default CPU idle stacks are excluded by simply excluding PID 0.
-+
-+
-+
-+For example,
-+
-+# alisysdelay -t 10
-+Sampling at 100 Hz frequency... Hit Ctrl-C to end.
-+
-+    iowrite16
-+    iowrite16
-+    vp_notify
-+    virtqueue_notify
-+    start_xmit
-+    dev_hard_start_xmit
-+    sch_direct_xmit
-+    __qdisc_run
-+    __dev_queue_xmit
-+    ip_finish_output2
-+    ip_output
-+    __tcp_transmit_skb
-+    tcp_write_xmit
-+    __tcp_push_pending_frames
-+    tcp_sendmsg_locked
-+    tcp_sendmsg
-+    sock_sendmsg
-+    sock_write_iter
-+    __vfs_write
-+    vfs_write
-+    ksys_write
-+    do_syscall_64
-+    entry_SYSCALL_64_after_hwframe
-+    __write_nocancel
-+    -                sshd (4617)
-+
-+In this case, sshd (PID 4617) is catched for staying in kernel mode
-+for more than 10us, without rescheduling.
-+
-+
-+
-+USAGE message:
-+# alisysdelay -h
-+usage: alisysdelay.py [-h] [-f FREQUENCY] [-d] [-a] [-t THRESHOLD]
-+                   [--stack-storage-size STACK_STORAGE_SIZE]
-+
-+Catch tasks hung in kernel mode and get its stack
-+
-+optional arguments:
-+  -h, --help            show this help message and exit
-+  -f FREQUENCY, --frequency FREQUENCY
-+                        sample frequency in Hertz (default 100)
-+  -d, --delimited       insert delimiter between kernel/user stacks
-+  -a, --annotations     add _[k] annotations to kernel frames
-+  -t THRESHOLD, --threshold THRESHOLD
-+                        set the threshold of delay in kernel mode, in us
-+                        (default 50)
-+  --stack-storage-size STACK_STORAGE_SIZE
-+                        the number of unique stack traces that can be stored
-+                        and displayed (default 16384)
-+
-+examples:
-+    ./alisysdelay             # catch tasks hung in kernel mode
-+    ./alisysdelay -f 99       # catch tasks hung in kernel mode at 99 Hertz
-+    ./alisysdelay -t 100      # catch tasks that hung in kernel mode for more than 100us
-diff --git a/tools/alisyslatency.py b/tools/alisyslatency.py
-new file mode 100755
-index 0000000..05bfdbd
---- /dev/null
-+++ b/tools/alisyslatency.py
-@@ -0,0 +1,171 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# alisyslatency  Summarize time spent in kernel context.
-+#                For Linux, uses BCC, eBPF.
-+#
-+# USAGE: alisyslatency [-h] [-T] [-N] [-c CPU] [-p PID] [interval] [outputs]
-+#
-+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+
-+from __future__ import print_function
-+from bcc import BPF
-+from time import sleep, strftime
-+import argparse
-+
-+# arguments
-+examples = """examples:
-+    ./alisyslatency            # sum time in kernel context per CPU
-+    ./alisyslatency -c 0       # show time in kernel context in CPU 0 only
-+    ./alisyslatency -p 25      # show time in kernel context in pid 25 only
-+    ./alisyslatency 1 10       # print 1 second summaries, 10 times
-+    ./alisyslatency -NT 1      # 1s summaries, nanoseconds, and timestamps
-+"""
-+
-+parser = argparse.ArgumentParser(
-+    description="Summarize time spent in kernel context.",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("-T", "--timestamp", action="store_true",
-+    help="include timestamp on output")
-+parser.add_argument("-N", "--nanoseconds", action="store_true",
-+    help="output in nanoseconds")
-+
-+thread_group = parser.add_mutually_exclusive_group()
-+thread_group.add_argument("-c", "--cpu",
-+    help="output statistics on specific CPU only")
-+thread_group.add_argument("-p", "--pid",
-+    help="output statistics of specific pid only")
-+
-+parser.add_argument("interval", nargs="?", default=99999999,
-+    help="output interval, in seconds")
-+parser.add_argument("outputs", nargs="?", default=99999999,
-+    help="number of outputs")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+
-+args = parser.parse_args()
-+countdown = int(args.outputs)
-+if args.nanoseconds:
-+    factor = 1
-+    label = "nsecs"
-+else:
-+    factor = 1000
-+    label = "usecs"
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+
-+typedef struct map_val {
-+    u64 count;
-+    u64 time;
-+    u64 time_max;
-+} map_val_t;
-+
-+BPF_PERCPU_ARRAY(start, u64, 1);
-+BPF_HASH(res, u32, map_val_t);
-+
-+
-+TRACEPOINT_PROBE(context_tracking, user_exit)
-+{
-+    COND_FILTER
-+
-+    u32 idx = 0;
-+    u64 ts = bpf_ktime_get_ns();
-+    start.update(&idx, &ts);
-+
-+    return 0;
-+}
-+
-+TRACEPOINT_PROBE(context_tracking, user_enter)
-+{
-+    COND_FILTER
-+
-+    u64 *tsp, delta;
-+    u32 idx = 0;
-+    // fetch timestamp and calculate delta
-+    tsp = start.lookup(&idx);
-+    if (!tsp || *tsp == 0) {
-+        return 0;   // missed start
-+    }
-+
-+#if STATISTICS_PER_PROCESS
-+    u32 key = bpf_get_current_pid_tgid();
-+#else
-+    u32 key = bpf_get_smp_processor_id();
-+#endif
-+
-+    delta = bpf_ktime_get_ns() - *tsp;
-+    map_val_t *valp, val;
-+
-+    valp = res.lookup(&key);
-+    if (valp) {
-+        valp->count += 1;
-+        valp->time += delta;
-+        if (valp->time_max < delta) {valp->time_max = delta;}
-+    }
-+    else {
-+        val.count = 1;
-+        val.time = val.time_max = delta;
-+        res.update(&key, &val);
-+    }
-+
-+    u64 zero = 0;
-+    start.update(&idx, &zero);
-+
-+    return 0;
-+}
-+"""
-+
-+cond_exp = "1"
-+if args.cpu:
-+    cond_exp = "bpf_get_smp_processor_id() == %s" % args.cpu
-+
-+if args.pid:
-+    cond_exp = "(u32)bpf_get_current_pid_tgid() == %s" % args.pid
-+
-+cond_filter = "if (!(%s)) {return 0;}" % cond_exp
-+bpf_text = bpf_text.replace('COND_FILTER', cond_filter)
-+
-+bpf_text = bpf_text.replace('STATISTICS_PER_PROCESS', "1" if args.cpu else "0")
-+
-+
-+# output eBPF program C code after it is replaced, used by debugging
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+
-+# load BPF program
-+b = BPF(text=bpf_text)
-+print("Tracing time in kernel context... Hit Ctrl-C to end.")
-+tab = b.get_table("res")
-+
-+
-+# output
-+while (1):
-+    try:
-+        sleep(int(args.interval))
-+    except KeyboardInterrupt:
-+        countdown = 1
-+    print()
-+    if args.timestamp:
-+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
-+
-+    # print header
-+    print("%10s %10s %10s %10s" %
-+            ("PID" if args.cpu else "CPU", "Count", "TOTAL_" + label, "MAX_" + label))
-+
-+    for k, v in sorted(tab.items(), key=lambda res: res[0]):
-+        print("%10d %10d %10d %10d" %
-+            (k, v.count, v.time / factor, v.time_max / factor))
-+
-+    tab.clear()
-+
-+    countdown -= 1
-+    if countdown == 0:
-+        exit()
-diff --git a/tools/alisyslatency_example.txt b/tools/alisyslatency_example.txt
-new file mode 100644
-index 0000000..f5dd2c7
---- /dev/null
-+++ b/tools/alisyslatency_example.txt
-@@ -0,0 +1,72 @@
-+Demonstrations of alisyslatency, the Linux eBPF/bcc version.
-+
-+
-+This program traces time spent in kernel context (between
-+context_tracking:user_exit and context_tracking:user_enter),
-+and stores timing statistics in-kernel for efficiency.
-+
-+
-+**Note**  This tool needs Context tracking subsystem, which can not
-+work without support of full dynticks.
-+
-+
-+Example 1
-+
-+By default, all statistics of all processes on all CPUs are sampled,
-+and the output is displayed in a per-CPU flavor.
-+
-+# alisyslatency
-+Tracing time in kernel context... Hit Ctrl-C to end.
-+^C
-+       CPU      Count TOTAL_usecs  MAX_usecs
-+
-+
-+Example 2
-+
-+You can sample statistics on specified CPU only with "-c" option,
-+and the output will be displayed in a per-process flavor.
-+
-+# alisyslatency -c 6
-+Tracing time in kernel context... Hit Ctrl-C to end.
-+^C
-+       PID      Count TOTAL_usecs  MAX_usecs
-+
-+
-+
-+Exampel 3
-+
-+You can sample statistics of specified process only with "-p" option,
-+and the output will be displayed in a per-CPU flavor.
-+
-+
-+# alisyslatency -p 3550
-+Tracing time in kernel context... Hit Ctrl-C to end.
-+^C
-+       CPU      Count TOTAL_usecs  MAX_usecs
-+
-+
-+
-+USAGE message:
-+
-+# alisyslatency -h
-+usage: alisyslatency.py [-h] [-T] [-N] [-c CPU | -p PID] [interval] [outputs]
-+
-+Summarize time spent in kernel context.
-+
-+positional arguments:
-+  interval           output interval, in seconds
-+  outputs            number of outputs
-+
-+optional arguments:
-+  -h, --help         show this help message and exit
-+  -T, --timestamp    include timestamp on output
-+  -N, --nanoseconds  output in nanoseconds
-+  -c CPU, --cpu CPU  output statistics on specific CPU only
-+  -p PID, --pid PID  output statistics of specific pid only
-+
-+examples:
-+    ./alisyslatency            # sum time in kernel context per CPU
-+    ./alisyslatency -c 0       # show time in kernel context in CPU 0 only
-+    ./alisyslatency -p 25      # show time in kernel context in pid 25 only
-+    ./alisyslatency 1 10       # print 1 second summaries, 10 times
-+    ./alisyslatency -NT 1      # 1s summaries, nanoseconds, and timestamps
-diff --git a/tools/aliworkslower.py b/tools/aliworkslower.py
-new file mode 100755
-index 0000000..6c7fffe
---- /dev/null
-+++ b/tools/aliworkslower.py
-@@ -0,0 +1,137 @@
-+#!/usr/bin/python
-+# @lint-avoid-python-3-compatibility-imports
-+#
-+# aliworkslower    Trace high work sched latency.
-+#                  For Linux, uses BCC, eBPF.
-+#
-+# USAGE: aliworkslower [-h] [threshold]
-+#
-+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
-+# Licensed under the Apache License, Version 2.0 (the "License")
-+
-+from __future__ import print_function
-+from bcc import BPF
-+import argparse
-+import ctypes as ct
-+
-+# arguments
-+examples = """examples:
-+    ./aliworkslower            # trace work sched latency higher than 10000 us (default)
-+    ./aliworkslower 1000       # trace work sched latency higher than  1000 us
-+"""
-+parser = argparse.ArgumentParser(
-+    description="Trace high work sched latency",
-+    formatter_class=argparse.RawDescriptionHelpFormatter,
-+    epilog=examples)
-+parser.add_argument("threshold", nargs="?", type=int, default=10000,
-+    help="threshold of Q2S latency, in microsecond")
-+parser.add_argument("--ebpf", action="store_true",
-+    help=argparse.SUPPRESS)
-+
-+args = parser.parse_args()
-+debug = 0
-+
-+# define BPF program
-+bpf_text = """
-+#include <uapi/linux/ptrace.h>
-+
-+typedef struct work_timestamp {
-+    u64 time_queue_work;
-+    u64 time_activate_work;
-+} work_timestamp_t;
-+
-+typedef struct work_latency {
-+    u64 pid;    // pid of worker thread
-+    u64 func;   // function pointer of the work callback
-+    u64 q2a;    // queue_work to activate_work
-+    u64 a2s;    // activate_work to execute_start
-+} work_latency_t;
-+
-+BPF_HASH(res, u64, work_timestamp_t);
-+BPF_PERF_OUTPUT(events);
-+
-+TRACEPOINT_PROBE(workqueue, workqueue_queue_work)
-+{
-+    u64 work = (u64)args->work;
-+    work_timestamp_t stamp = {
-+        .time_queue_work =  bpf_ktime_get_ns(),
-+    };
-+    res.update(&work, &stamp);
-+
-+    return 0;
-+}
-+
-+TRACEPOINT_PROBE(workqueue, workqueue_activate_work)
-+{
-+    u64 work = (u64)args->work;
-+    work_timestamp_t *stampp = res.lookup(&work);
-+
-+    if (stampp) {
-+        stampp->time_activate_work = bpf_ktime_get_ns();
-+    }
-+
-+    return 0;
-+}
-+
-+TRACEPOINT_PROBE(workqueue, workqueue_execute_start)
-+{
-+    u64 work = (u64)args->work;
-+    u64 now = bpf_ktime_get_ns();
-+    work_timestamp_t *stampp = res.lookup(&work);
-+
-+    if (stampp) {
-+        u64 delta = now - stampp->time_queue_work;
-+        if (delta > THRESHOLD) {
-+            work_latency_t latency = {
-+                .pid = (u32)bpf_get_current_pid_tgid(),
-+                .func = (u64)args->function,
-+                .q2a = stampp->time_activate_work - stampp->time_queue_work,
-+                .a2s = now - stampp->time_activate_work,
-+            };
-+            events.perf_submit(args, &latency, sizeof(work_latency_t));
-+        }
-+    }
-+
-+    return 0;
-+}
-+"""
-+
-+bpf_text = bpf_text.replace('THRESHOLD', "%d" % (args.threshold * 1000))
-+
-+# output eBPF program C code after it is replaced, used by debugging
-+if debug or args.ebpf:
-+    print(bpf_text)
-+    if args.ebpf:
-+        exit()
-+
-+
-+class Record(ct.Structure):
-+    _fields_ = [("pid", ct.c_ulong),
-+                ("func", ct.c_ulonglong),
-+                ("q2a", ct.c_ulonglong),
-+                ("a2s", ct.c_ulonglong)]
-+
-+
-+def print_record(cpu, data, size):
-+    event = ct.cast(data, ct.POINTER(Record)).contents
-+    print("%10s %20s %25d %25d %25d" % (event.pid,
-+                                        b.ksym(event.func),
-+                                        (event.q2a + event.a2s) / 1000,
-+                                        event.q2a / 1000,
-+                                        event.a2s / 1000))
-+
-+# load BPF program
-+b = BPF(text=bpf_text)
-+print("Tracing high work sched latency... Hit Ctrl-C to end.")
-+print("%10s %20s %25s %25s %25s" %
-+    ("Worker Thread", "Work", "Total Q2S latency (us)", "Q2A latency (us)", "A2S latency (us)"))
-+
-+b["events"].open_perf_buffer(print_record)
-+
-+
-+# output
-+while (1):
-+    try:
-+        b.perf_buffer_poll()
-+    except KeyboardInterrupt:
-+        exit()
-diff --git a/tools/aliworkslower_example.txt b/tools/aliworkslower_example.txt
-new file mode 100644
-index 0000000..21775d0
---- /dev/null
-+++ b/tools/aliworkslower_example.txt
-@@ -0,0 +1,39 @@
-+aliworkslower shows high latency of work between workqueue_queue_work
-+and workqueue_execute_start.
-+
-+Users can specify a threshhold, and whenever the latency between
-+workqueue_queue_work and workqueue_execute_start exceeds this
-+threshold, related information of this work is outputed.
-+
-+For example,
-+
-+# ./aliworkslower 15
-+Tracing high work sched latency... Hit Ctrl-C to end.
-+Worker Thread                 Work    Total Q2S latency (us)          Q2A latency (us)          A2S latency (us)
-+     19783        vmstat_update                        36                         0                        35
-+     29195        vmstat_update                        16                         0                        15
-+
-+- column "Worker Thread": PID of the worker thread
-+- column "Work": work callback
-+- column "Total Q2S latency": latency between workqueue_queue_work (Q)
-+and workqueue_execute_start (S), in us
-+- column "Q2A latency": latency between workqueue_queue_work (Q)
-+and workqueue_activate_work (A), in us
-+- column "A2S latency": latency between workqueue_activate_work (A)
-+and workqueue_execute_start (S), in us
-+
-+
-+# aliworkslower --help
-+usage: aliworkslower.py [-h] [threshold]
-+
-+Trace high work sched latency
-+
-+positional arguments:
-+  threshold   threshold of Q2S latency, in microsecond
-+
-+optional arguments:
-+  -h, --help  show this help message and exit
-+
-+examples:
-+    ./aliworkslower            # trace work sched latency higher than 10000 us (default)
-+    ./aliworkslower 1000       # trace work sched latency higher than  1000 us
--- 
-2.27.0
-
diff --git a/1001-Skiping-cases-in-libbpf-tools.patch b/1001-Skiping-cases-in-libbpf-tools.patch
deleted file mode 100644
index 1afa2d8dab842f9e436b61dae8b53ebe18746402..0000000000000000000000000000000000000000
--- a/1001-Skiping-cases-in-libbpf-tools.patch
+++ /dev/null
@@ -1,26 +0,0 @@
-From f67a5908c55568e369e4ab5b36b90aeb38c93caa Mon Sep 17 00:00:00 2001
-Message-Id: <f67a5908c55568e369e4ab5b36b90aeb38c93caa.1675925685.git.llfl@linux.alibaba.com>
-From: "Kun(llfl)" <llfl@linux.alibaba.com>
-Date: Thu, 9 Feb 2023 14:53:53 +0800
-Subject: [PATCH 1001/1001] Skiping cases in libbpf-tools
-
-Signed-off-by: Kun(llfl) <llfl@linux.alibaba.com>
----
- libbpf-tools/Makefile | 1 -
- 1 file changed, 1 deletion(-)
-
-diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile
-index a48b4e2..bb4b012 100644
---- a/libbpf-tools/Makefile
-+++ b/libbpf-tools/Makefile
-@@ -57,7 +57,6 @@ APPS = \
- 	funclatency \
- 	gethostlatency \
- 	hardirqs \
--	javagc \
- 	klockstat \
- 	ksnoop \
- 	llcstat \
--- 
-2.39.0
-
diff --git a/1003-add-u64-define.patch b/1003-add-u64-define.patch
deleted file mode 100644
index 7132712ea32c9220745b99ae5517d78bc6c21bac..0000000000000000000000000000000000000000
--- a/1003-add-u64-define.patch
+++ /dev/null
@@ -1,24 +0,0 @@
-From 57ef40d8219e62f791f25186077cc6fc135e246c Mon Sep 17 00:00:00 2001
-From: "songnannan.snn@alibaba-inc.com" <songnannan.snn@alibaba-inc.com>
-Date: Sun, 2 Apr 2023 07:32:35 +0800
-Subject: [PATCH 1/1] add u64 define
-
----
- libbpf-tools/wakeuptime.h | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/libbpf-tools/wakeuptime.h b/libbpf-tools/wakeuptime.h
-index 3c5376a..3f86907 100644
---- a/libbpf-tools/wakeuptime.h
-+++ b/libbpf-tools/wakeuptime.h
-@@ -5,6 +5,7 @@
- 
- #define MAX_ENTRIES 10240
- #define TASK_COMM_LEN 16
-+#define u64 unsigned long long
- 
- struct key_t {
- 	char waker[TASK_COMM_LEN];
--- 
-2.31.1
-
diff --git a/1004-fix-man-pages-installation.patch b/1004-fix-man-pages-installation.patch
deleted file mode 100644
index d975f72293d5569c7e92a7efa93827e95bae43ad..0000000000000000000000000000000000000000
--- a/1004-fix-man-pages-installation.patch
+++ /dev/null
@@ -1,16 +0,0 @@
---- bcc-0.27.0/man/man8/CMakeLists.txt.orig	2023-04-17 22:30:03.667858093 +0800
-+++ bcc-0.27.0/man/man8/CMakeLists.txt	2023-04-17 22:30:37.377858093 +0800
-@@ -1,12 +1,2 @@
--find_program(GZIP gzip)
- file(GLOB FILES *.8)
--set(GZFILES "")
--foreach(FIL ${FILES})
--  get_filename_component(NAME ${FIL} NAME)
--  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz
--    COMMAND ${GZIP} -c ${FIL} > ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz
--    DEPENDS ${FIL})
--  list(APPEND GZFILES "${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz")
--endforeach()
--add_custom_target(man ALL DEPENDS ${GZFILES})
--install(FILES ${GZFILES} DESTINATION share/bcc/man/man8)
-+install(FILES ${FILES} DESTINATION ${CMAKE_INSTALL_MANDIR}/man8)
diff --git a/bcc.spec b/bcc.spec
index 7e6a23ad8ce8949b924e1543c9d0eb1cc690cee2..46eb78e80952364f3eb01398a054a3365378f9b2 100644
--- a/bcc.spec
+++ b/bcc.spec
@@ -1,4 +1,4 @@
-%define anolis_release 5
+%define anolis_release 1
 %ifarch loongarch64
 %bcond_without ignore_lua
 %bcond_with libbpf_tools
@@ -9,17 +9,17 @@
 
 
 Name:           bcc
-Version:        0.27.0
+Version:        0.30.0
 Release:        %{anolis_release}%{?dist}
 Summary:        BPF Compiler Collection (BCC)
 License:        ASL 2.0
 URL:            https://github.com/iovisor/bcc
 Source0:        %{url}/archive/refs/tags/v%{version}.tar.gz
 
-Patch1000:      1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch
-Patch1001:      1001-Skiping-cases-in-libbpf-tools.patch
-Patch1003:      1003-add-u64-define.patch
-Patch1004:      1004-fix-man-pages-installation.patch
+#Patch1000:      1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch
+#Patch1001:      1001-Skiping-cases-in-libbpf-tools.patch
+#Patch1003:      1003-add-u64-define.patch
+#Patch1004:      1004-fix-man-pages-installation.patch
 
 BuildRequires:  cmake >= 2.8.12 llvm-devel clang-devel llvm-googletest
 BuildRequires:  flex bison python3-setuptools
@@ -131,10 +131,12 @@ find %{buildroot}%{_datadir}/%{name}/{tools,examples} -type f -exec \
          -e '1s=^#!/usr/bin/env python\([0-9.]\+\)\?$=#!%{__python3}=' \
          -e '1s=^#!/usr/bin/env bcc-lua$=#!/usr/bin/bcc-lua=' {} \;
 
-# Move man pages to the right location to avoid conflict with other manpages
-pushd %{buildroot}%{_mandir}/man8
-  find . -name '*.8' -exec bash -c 'mv $0 bcc-`basename $0`' {} \;
-popd
+mkdir -p %{buildroot}%{_mandir}
+mv %{buildroot}%{_datadir}/%{name}/man/* %{buildroot}%{_mandir}/
+for i in `find %{buildroot}%{_mandir} -name "*.gz"`; do
+  tname=$(basename $i)
+  rename $tname %{name}-$tname $i
+done
 
 mkdir -p %{buildroot}%{_docdir}/%{name}
 mv %{buildroot}%{_datadir}/%{name}/examples %{buildroot}%{_docdir}/%{name}/
@@ -189,6 +191,9 @@ install -p libbpf-tools/tmp-install/bin/* %{buildroot}/%{_sbindir}/
 %endif
 
 %changelog
+* Wed May 08 2024 mgb01105731 <mgb01105731@alibaba-inc.com> - 0.30.0-1
+- update to 0.30.0
+
 * Thu Mar 28 2024 Zhao Hang <wb-zh951434@alibaba-inc.com> - 0.27.0-5
 - Rebuild with python3.11
 
diff --git a/v0.27.0.tar.gz b/v0.30.0.tar.gz
similarity index 42%
rename from v0.27.0.tar.gz
rename to v0.30.0.tar.gz
index ace68f77c057b7c9ac17e8909fd6dcf7f24d5cb9..d8ddab4d0aa3238bc12594f79b9961ed99cb77b2 100644
Binary files a/v0.27.0.tar.gz and b/v0.30.0.tar.gz differ