From 574dea9f0aef288d219f3fed3277349da07dd3dc Mon Sep 17 00:00:00 2001
From: Chunmei Xu
Date: Thu, 25 Nov 2021 17:33:20 +0800
Subject: [PATCH 1/5] add ali enhanced tools for fs schedule and sys

Signed-off-by: Chunmei Xu
---
 ...i-enhanced-tools-for-fs-schedule-sys.patch | 6172 +++++++++++++++++
 bcc.spec                                      |    8 +-
 2 files changed, 6179 insertions(+), 1 deletion(-)
 create mode 100644 1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch

diff --git a/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch b/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch
new file mode 100644
index 0000000..e6348c7
--- /dev/null
+++ b/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch
@@ -0,0 +1,6172 @@
+diff --git a/man/man8/alibiolatency.8 b/man/man8/alibiolatency.8
+new file mode 100644
+index 0000000..93f52fa
+--- /dev/null
++++ b/man/man8/alibiolatency.8
+@@ -0,0 +1,76 @@
++.TH alibiolatency 8 "2019-08-25" "USER COMMANDS"
++.SH NAME
++alibiolatency \- Summarize block device I/O latency.
++.SH SYNOPSIS
++.B alibiolatency [\-h] [\-i interval] [-d device] [\-t avg_req_time] [\-T req_time] [-r]
++.SH DESCRIPTION
++
++alibiolatency traces every I/O request's life cycle and filters and summarizes
++the collected data. With this tool, we can clearly see how much each stage of
++an I/O request contributes to its total latency; for example, if the D2C time
++(see blktrace) of I/O requests takes the majority of the total request
++processing time, the disk bottleneck has most likely been reached.
++
++This tool uses in-kernel eBPF maps for storing timestamps for efficiency.
++
++This works by tracing various kernel block tracepoints using dynamic tracing,
++and will need updating to match any changes to these tracepoints.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++\-h
++Print usage message.
++.TP
++\-i interval
++Specify the display interval, in seconds.
++.TP
++\-r
++Dump every I/O request's life cycle.
++.TP
++\-d device
++Inspect the specified device only.
++.TP
++\-t avg_req_time
++Display only when the average request processing time is greater than this value, in milliseconds.
++.TP
++\-T req_time
++Dump a request's life cycle when a single request's processing time is greater than this value, in milliseconds.
++.SH EXAMPLES
++.TP
++Summarize block device I/O latency (default display interval is 2 seconds):
++#
++.B alibiolatency
++.TP
++Print statistics every 5 seconds:
++#
++.B alibiolatency \-i 5
++.TP
++Print only when the average request processing time is greater than 10ms:
++#
++.B alibiolatency \-t 10
++.TP
++Dump a request's life cycle when a single request's processing time is greater than 20ms:
++#
++.B alibiolatency \-T 20
++.SH OVERHEAD
++This traces kernel functions and maintains in-kernel timestamps and a histogram,
++which are asynchronously copied to user-space. This method is very efficient,
++and the overhead for most storage I/O rates (< 10k IOPS) should be negligible.
++If you have a higher IOPS storage environment, test and quantify the overhead
++before use.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Xiaoguang Wang
++.SH SEE ALSO
++biosnoop(8)
+diff --git a/man/man8/aliexitsnoop.8 b/man/man8/aliexitsnoop.8
+new file mode 100644
+index 0000000..cd7215e
+--- /dev/null
++++ b/man/man8/aliexitsnoop.8
+@@ -0,0 +1,109 @@
++.TH aliexitsnoop 8 "2019-05-28" "USER COMMANDS"
++.SH NAME
++aliexitsnoop \- Trace all process termination (exit, fatal signal). Uses Linux eBPF/bcc.
++.SH SYNOPSIS
++.B aliexitsnoop [\-h] [\-t] [\-\-utc] [\-x] [\-p PID] [\-\-label LABEL]
++.SH DESCRIPTION
++aliexitsnoop traces process termination, showing the command name and reason for
++termination, either an exit or a fatal signal. This Linux tool is an enhanced
++version of exitsnoop. For more details about exitsnoop, see exitsnoop_example.txt.
++Compared to exitsnoop, aliexitsnoop prints not only the exit code, PID, PPID, and
++lifetime of the exiting process, but also its kernel and user stacks. The demand
++stems from the fact that in production environments, some processes terminate
++unexpectedly; dumping the stack of the target process helps find out what caused
++it to exit.
++
++It catches processes of all users, processes in containers, as well
++as processes that become zombies.
++
++This works by tracing the kernel do_exit() function,
++and will need updating to match any changes to this function.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++\-t
++Include a timestamp column.
++.TP
++\-\-utc
++Include a timestamp column, use UTC timezone.
++.TP
++\-x
++Exclude successful exits, exit(0)
++.TP
++\-p PID
++Trace this process ID only (filtered in-kernel).
++.TP
++\-\-label LABEL
++Label each line with LABEL (default 'exit') in first column (2nd if timestamp is present).
++.SH EXAMPLES
++.TP
++Trace all process termination:
++#
++.B aliexitsnoop
++.TP
++Trace all process termination, and include timestamps:
++#
++.B aliexitsnoop \-t
++.TP
++Exclude successful exits, only include non-zero exit codes and fatal signals:
++#
++.B aliexitsnoop \-x
++.TP
++Trace PID 181 only:
++#
++.B aliexitsnoop \-p 181
++.TP
++Label each output line with 'EXIT':
++#
++.B aliexitsnoop \-\-label EXIT
++.SH FIELDS
++.TP
++TIME-TZ
++Time of process termination HH:MM:SS.sss with milliseconds, where TZ is
++the local time zone, 'UTC' with \-\-utc option.
++.TP
++LABEL
++The optional label if \-\-label option is used. This is useful with the
++\-t option for timestamps when the output of several tracing tools is
++sorted into one combined output.
++.TP
++PCOMM
++Process/command name.
++.TP
++PID
++Process ID
++.TP
++PPID
++The process ID of the process that will be notified of PID termination.
++.TP
++TID
++Thread ID.
++.TP
++EXIT_CODE
++The exit code for exit() or the signal number for a fatal signal.
++.SH OVERHEAD
++This traces the kernel do_exit() function and prints output for each event.
++As the rate of this is generally expected to be low (< 1000/s), the overhead is also
++expected to be negligible. If you have an application that has a high rate of
++process termination, then test and understand overhead before use.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Alvin Zheng
++.SH SEE ALSO
++aliexecsnoop(8)
+diff --git a/man/man8/aliext4writeslower.8 b/man/man8/aliext4writeslower.8
+new file mode 100644
+index 0000000..007a474
+--- /dev/null
++++ b/man/man8/aliext4writeslower.8
+@@ -0,0 +1,70 @@
++.TH aliext4writeslower 8 "2019-08-30" "USER COMMANDS"
++.SH NAME
++aliext4writeslower \- Trace common ext4 write(2) slower than a threshold
++.SH SYNOPSIS
++.B aliext4writeslower [-h] [-p PID] [-d device] [min_ms]
++.SH DESCRIPTION
++
++This script traces key steps of write(2) that may be time-consuming. It
++traces four key steps: jbd2 handle start (which may require a jbd2
++transaction checkpoint or commit), page cache allocation, block allocation,
++and dirty-page balancing, and it also takes scheduling delay into account.
++
++With this tool, we can clearly see how write(2) latency is apportioned
++among these steps.
++
++This tool uses in-kernel eBPF maps for storing timestamps for efficiency.
++
++This works by tracing various kernel functions using dynamic tracing,
++and will need updating to match any changes to these functions.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++\-h
++Print usage message.
++.TP
++\-p PID
++Trace this PID only.
++.TP
++min_ms
++Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
++.SH EXAMPLES
++.TP
++Trace write(2) slower than 10 ms (default).
++#
++.B aliext4writeslower
++.TP
++Trace write(2) slower than 5 ms.
++#
++.B aliext4writeslower 5
++.TP
++Trace PID 185 only.
++#
++.B aliext4writeslower -p 185
++.TP
++Trace /dev/sda5, which contains an ext4 fs.
++#
++.B aliext4writeslower -d sda5
++.TP
++.SH OVERHEAD
++This traces kernel functions and maintains in-kernel timestamps and a histogram,
++which are asynchronously copied to user-space. This method is very efficient,
++and the overhead for most storage I/O rates (< 10k IOPS) should be negligible.
++If you have a higher IOPS storage environment, test and quantify the overhead
++before use.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Xiaoguang Wang
++.SH SEE ALSO
++biosnoop(8)
+diff --git a/man/man8/alihardirqs.8 b/man/man8/alihardirqs.8
+new file mode 100644
+index 0000000..d874e89
+--- /dev/null
++++ b/man/man8/alihardirqs.8
+@@ -0,0 +1,99 @@
++.TH alihardirqs 8 "2019-7-25" "USER COMMANDS"
++.SH NAME
++alihardirqs \- Summarize hard IRQ (interrupt) event time.
++.SH SYNOPSIS
++.B alihardirqs [\-h] [\-T] [\-N] [\-D] [\-c CPU] [\-i IRQ] [interval] [outputs]
++.SH DESCRIPTION
++This program summarizes hard IRQ statistics.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++\-T
++Include timestamps on output.
++.TP
++\-N
++Output in nanoseconds.
++.TP
++\-D
++Show distributions as histograms.
++.TP
++\-c
++Show hardIRQ statistics on a specific CPU only.
++.TP
++\-i
++Show statistics of a specific hardIRQ only.
++.TP
++interval
++Output interval, in seconds.
++.TP
++outputs
++Number of outputs.
++.SH EXAMPLES
++.TP
++Summarize hardIRQ statistics:
++#
++.B alihardirqs
++.TP
++Print 1 second summaries, 10 times:
++#
++.B alihardirqs 1 10
++.TP
++Print 1s summaries, nanoseconds, and timestamps:
++#
++.B alihardirqs \-NT 1
++.TP
++Show hardIRQ statistics as histograms:
++#
++.B alihardirqs \-D
++.TP
++Show hardIRQ statistics of CPU 0 only:
++#
++.B alihardirqs \-c 0
++.TP
++Show statistics of hardIRQ 25 only:
++#
++.B alihardirqs \-i 25
++.SH FIELDS
++.TP
++CPU
++CPU index.
++.TP
++HARDIRQ
++hardIRQ number.
++.TP
++IRQHandler
++Name of hardIRQ handler.
++.TP
++Count
++Number of samples on a specific CPU or hardIRQ.
++.TP
++Total_
++Total time spent in a specific hardIRQ.
++.TP
++MAX_
++Maximum time spent in a specific hardIRQ.
++.TP
++distribution
++An ASCII bar chart to visualize the distribution (Total_ column)
++.SH OVERHEAD
++This instruments every irq_handler_entry/irq_handler_exit tracepoint and
++performs the summaries in-kernel.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Jeffle Xu
+diff --git a/man/man8/alijbd2stat.8 b/man/man8/alijbd2stat.8
+new file mode 100644
+index 0000000..4f2de07
+--- /dev/null
++++ b/man/man8/alijbd2stat.8
+@@ -0,0 +1,61 @@
++.TH alijbd2stat 8 "2019-08-25" "USER COMMANDS"
++.SH NAME
++alijbd2stat \- Summarize filesystem jbd2 stat.
++.SH SYNOPSIS
++.B alijbd2stat [\-h] [\-d device] [\-t thresh]
++.SH DESCRIPTION
++
++alijbd2stat traces every jbd2 handle and every transaction's commit info.
++
++If a jbd2 handle's running time is greater than the user-specified threshold,
++alijbd2stat will capture that handle's context.
++
++This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
++for efficiency.
++
++This works by tracing various kernel jbd2 functions using dynamic tracing,
++and will need updating to match any changes to these functions.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++\-h
++Print usage message.
++.TP
++\-d device
++Inspect the specified device only.
++.TP
++\-t thresh
++Show a jbd2 handle's context when its run time is greater than this threshold.
++.SH EXAMPLES
++.TP
++Print all jbd2 instances' stats.
++#
++.B alijbd2stat
++.TP
++Inspect the specified device /dev/sda3, on which some filesystem uses jbd2.
++#
++.B alijbd2stat \-d sda3
++.TP
++Show a jbd2 handle's context when it runs for more than 10ms.
++#
++.B alijbd2stat \-t 10
++.TP
++.SH OVERHEAD
++Test and quantify the overhead before use.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Xiaoguang Wang
++.SH SEE ALSO
++biosnoop(8)
+diff --git a/man/man8/alimutexsnoop.8 b/man/man8/alimutexsnoop.8
+new file mode 100644
+index 0000000..f0342b4
+--- /dev/null
++++ b/man/man8/alimutexsnoop.8
+@@ -0,0 +1,71 @@
++.TH alimutexsnoop 8 "2019-05-28" "USER COMMANDS"
++.SH NAME
++alimutexsnoop \- Trace mutexes that are held by a process longer than a specified time threshold. Uses Linux eBPF/bcc.
++.SH SYNOPSIS
++.B alimutexsnoop [\-h] [\-t THRESHOLD] [\-p PID]
++.SH DESCRIPTION
++alimutexsnoop monitors mutexes that are held by a process longer than
++a specified time threshold, such as 100us. Once alimutexsnoop finds such a mutex,
++it prints the PID of the process that held the mutex and the address of the
++mutex. In addition, alimutexsnoop also prints the user and kernel stacks of the
++process.
++
++This works by tracing the kernel mutex_lock() and mutex_unlock() functions,
++and will need updating to match any changes to these functions.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++\-t
++Set the time threshold, in microseconds. Mutexes held longer than this threshold will be traced.
++.TP
++\-p PID
++Trace this process ID only (filtered in-kernel).
++.SH EXAMPLES
++.TP
++Trace all mutexes that are held longer than 1000us (default):
++#
++.B alimutexsnoop
++.TP
++Trace all mutexes that are held longer than 50us:
++#
++.B alimutexsnoop \-t 50
++.TP
++Trace PID 181 only:
++#
++.B alimutexsnoop \-p 181
++.SH FIELDS
++.TP
++PCOMM
++Process/command name.
++.TP
++PID
++Process ID
++.TP
++MUTEX_ADDR
++The address of the mutex object.
++.SH OVERHEAD
++This traces the kernel mutex_lock() and mutex_unlock() functions and computes how
++long a process holds each mutex. The overhead is proportional to the frequency of
++mutex acquire and release operations. If you have an application that acquires and
++releases mutexes frequently, then test and understand the overhead before use.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Alvin Zheng
++.SH SEE ALSO
++alimutexsnoop(8)
+diff --git a/man/man8/alirunqinfo.8 b/man/man8/alirunqinfo.8
+new file mode 100644
+index 0000000..1ffa3cb
+--- /dev/null
++++ b/man/man8/alirunqinfo.8
+@@ -0,0 +1,109 @@
++.TH alirunqinfo 8 "2019-7-25" "USER COMMANDS"
++.SH NAME
++alirunqinfo \- Summarize runqueue statistics.
++.SH SYNOPSIS
++.B alirunqinfo [\-h] [\-T] [\-N] [\-D] [\-C] [\-P] [\-c CPU] [\-p PID] [interval] [outputs]
++.SH DESCRIPTION
++This program summarizes task on-CPU time before being descheduled.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++\-T
++Include timestamps on output.
++.TP
++\-N
++Output in nanoseconds.
++.TP
++\-D
++Show distributions as histograms.
++.TP
++\-C
++Show per-CPU runqueue statistics.
++.TP
++\-P
++Show per-thread runqueue statistics.
++.TP
++\-c
++Show runqueue statistics on a specific CPU only.
++.TP
++\-p
++Show runqueue statistics of a specific PID only.
++.TP
++interval
++Output interval, in seconds.
++.TP
++outputs
++Number of outputs.
++.SH EXAMPLES
++.TP
++Summarize runqueue statistics:
++#
++.B alirunqinfo
++.TP
++Print 1 second summaries, 10 times:
++#
++.B alirunqinfo 1 10
++.TP
++Print 1s summaries, nanoseconds, and timestamps:
++#
++.B alirunqinfo \-NT 1
++.TP
++Show per-CPU runqueue statistics:
++#
++.B alirunqinfo \-C
++.TP
++Show per-thread runqueue statistics:
++#
++.B alirunqinfo \-P
++.TP
++Show runqueue statistics as histograms:
++#
++.B alirunqinfo \-D
++.TP
++Show runqueue statistics of CPU 0 only:
++#
++.B alirunqinfo \-c 0
++.TP
++Show runqueue statistics of PID 25 only:
++#
++.B alirunqinfo \-p 25
++.SH FIELDS
++.TP
++CPU
++CPU index.
++.TP
++PID
++PID of sampled process.
++.TP
++Thread
++Name of the specified process.
++.TP
++Count
++Number of samples on a specific CPU or PID.
++.TP
++Total_
++Total task on-CPU time accumulated on a specific CPU or PID.
++.TP
++distribution
++An ASCII bar chart to visualize the distribution (Total_ column)
++.SH OVERHEAD
++This instruments every sched_switch tracepoint and performs the summaries in-kernel.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Jeffle Xu
+diff --git a/man/man8/alisoftirqs.8 b/man/man8/alisoftirqs.8
+new file mode 100644
+index 0000000..1fc9c50
+--- /dev/null
++++ b/man/man8/alisoftirqs.8
+@@ -0,0 +1,99 @@
++.TH alisoftirqs 8 "2019-7-25" "USER COMMANDS"
++.SH NAME
++alisoftirqs \- Summarize soft IRQ (interrupt) event time.
++.SH SYNOPSIS
++.B alisoftirqs [\-h] [\-T] [\-N] [\-D] [\-c CPU] [\-i IRQ] [interval] [outputs]
++.SH DESCRIPTION
++This program summarizes soft IRQ statistics.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++\-T
++Include timestamps on output.
++.TP
++\-N
++Output in nanoseconds.
++.TP
++\-D
++Show distributions as histograms.
++.TP
++\-c
++Show softIRQ statistics on a specific CPU only.
++.TP
++\-i
++Show statistics of a specific softIRQ only.
++.TP
++interval
++Output interval, in seconds.
++.TP
++outputs
++Number of outputs.
++.SH EXAMPLES
++.TP
++Summarize softIRQ statistics:
++#
++.B alisoftirqs
++.TP
++Print 1 second summaries, 10 times:
++#
++.B alisoftirqs 1 10
++.TP
++Print 1s summaries, nanoseconds, and timestamps:
++#
++.B alisoftirqs \-NT 1
++.TP
++Show softIRQ statistics as histograms:
++#
++.B alisoftirqs \-D
++.TP
++Show softIRQ statistics of CPU 0 only:
++#
++.B alisoftirqs \-c 0
++.TP
++Show statistics of softIRQ 25 only:
++#
++.B alisoftirqs \-i 25
++.SH FIELDS
++.TP
++CPU
++CPU index.
++.TP
++SOFTIRQ
++softIRQ number.
++.TP
++IRQHandler
++Name of softIRQ handler.
++.TP
++Count
++Number of samples on a specific CPU or softIRQ.
++.TP
++Total_
++Total time spent in a specific softIRQ.
++.TP
++MAX_
++Maximum time spent in a specific softIRQ.
++.TP
++distribution
++An ASCII bar chart to visualize the distribution (Total_ column)
++.SH OVERHEAD
++This instruments every softirq_entry/softirq_exit tracepoint and
++performs the summaries in-kernel.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Jeffle Xu
+diff --git a/man/man8/alisyscount.8 b/man/man8/alisyscount.8
+new file mode 100644
+index 0000000..284084a
+--- /dev/null
++++ b/man/man8/alisyscount.8
+@@ -0,0 +1,112 @@
++.TH alisyscount 8 "2019-7-25" "USER COMMANDS"
++.SH NAME
++alisyscount \- Summarize syscall counts and latencies.
++.SH SYNOPSIS
++.B alisyscount [\-h] [\-T] [\-N] [\-D] [\-S] [\-P] [\-s syscall] [\-p PID] [interval] [outputs]
++.SH DESCRIPTION
++This program summarizes syscall counts and latencies.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++\-T
++Include timestamps on output.
++.TP
++\-N
++Output in nanoseconds.
++.TP
++\-D
++Show distributions as histograms.
++.TP
++\-S
++Show per-syscall statistics.
++.TP
++\-P
++Show per-thread statistics.
++.TP
++\-s
++Show statistics of a specific syscall only.
++.TP
++\-p
++Show syscall statistics of a specific PID only.
++.TP
++interval
++Output interval, in seconds.
++.TP
++outputs
++Number of outputs.
++.SH EXAMPLES
++.TP
++Summarize syscall statistics:
++#
++.B alisyscount
++.TP
++Print 1 second summaries, 10 times:
++#
++.B alisyscount 1 10
++.TP
++Print 1s summaries, nanoseconds, and timestamps:
++#
++.B alisyscount \-NT 1
++.TP
++Show syscall statistics as histograms:
++#
++.B alisyscount \-D
++.TP
++Show per-syscall statistics:
++#
++.B alisyscount \-S
++.TP
++Show per-thread syscall statistics:
++#
++.B alisyscount \-P
++.TP
++Show statistics of syscall 25 only:
++#
++.B alisyscount \-s 25
++.TP
++Show syscall statistics of PID 1213 only:
++#
++.B alisyscount \-p 1213
++.SH FIELDS
++.TP
++SysNR
++Syscall number.
++.TP
++Syscall
++Name of syscall.
++.TP
++PID
++PID of specific process.
++.TP
++Thread
++Name of specific process.
++.TP
++Count
++Number of samples on a specific PID or syscall.
++.TP
++Total_
++Total time spent in a specific syscall.
++.TP
++distribution
++An ASCII bar chart to visualize the distribution (Total_ column)
++.SH OVERHEAD
++This instruments every sys_enter/sys_exit tracepoint and performs the summaries in-kernel.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Jeffle Xu
+diff --git a/man/man8/alisysdelay.8 b/man/man8/alisysdelay.8
+new file mode 100644
+index 0000000..a7d91aa
+--- /dev/null
++++ b/man/man8/alisysdelay.8
+@@ -0,0 +1,71 @@
++.TH alisysdelay 8 "2019-7-25" "USER COMMANDS"
++.SH NAME
++alisysdelay \- Catch tasks stuck in kernel mode and get their stacks.
++.SH SYNOPSIS
++.B alisysdelay [\-h] [\-d] [\-a] [\-f frequency] [\-t threshold] [\-\-stack\-storage\-size size]
++.SH DESCRIPTION
++This program catches tasks stuck in kernel mode and gets their stacks.
++
++In kernels without CONFIG_PREEMPT enabled, scheduling latency may sometimes
++be unacceptable, because some tasks may run too long in kernel mode
++without calling cond_resched(). These tasks may be kernel threads, or user
++processes that enter kernel mode via syscalls.
++
++alisysdelay can catch tasks that have been running continuously for a long time
++without scheduling, and the kernel/user stacks of these tasks will be printed.
++The threshold can be user defined.
++
++By default CPU idle stacks are excluded by simply excluding PID 0.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++\-d
++Insert delimiter between kernel/user stacks.
++.TP
++\-a
++Add _[k] annotations to kernel frames.
++.TP
++\-f
++Sample frequency in Hertz.
++.TP
++\-t
++Set the threshold of delay in kernel mode, in microseconds.
++.TP
++\-\-stack\-storage\-size
++The number of unique stack traces that can be stored and displayed.
++.SH EXAMPLES
++.TP
++Catch tasks stuck in kernel mode and get their stacks:
++#
++.B alisysdelay
++.TP
++Catch tasks stuck in kernel mode, sampling at 99 Hz:
++#
++.B alisysdelay \-f 99
++.TP
++Catch tasks that were stuck in kernel mode for more than 100us:
++#
++.B alisysdelay \-t 100
++.SH FIELDS
++.SH OVERHEAD
++This instruments every sys_enter/sched_switch tracepoint and the _cond_resched
++kprobe, and performs the summaries in-kernel.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Jeffle Xu
+diff --git a/man/man8/alisyslatency.8 b/man/man8/alisyslatency.8
+new file mode 100644
+index 0000000..854b7a0
+--- /dev/null
++++ b/man/man8/alisyslatency.8
+@@ -0,0 +1,85 @@
++.TH alisyslatency 8 "2019-7-25" "USER COMMANDS"
++.SH NAME
++alisyslatency \- Summarize time spent in kernel context.
++.SH SYNOPSIS
++.B alisyslatency [\-h] [\-T] [\-N] [\-c CPU] [\-p PID] [interval] [outputs]
++.SH DESCRIPTION
++This program summarizes time spent in kernel context.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++\-T
++Include timestamps on output.
++.TP
++\-N
++Output in nanoseconds.
++.TP
++\-c
++Show statistics on a specific CPU only.
++.TP
++\-p
++Show statistics of a specific PID only.
++.TP
++interval
++Output interval, in seconds.
++.TP
++outputs
++Number of outputs.
++.SH EXAMPLES
++.TP
++Summarize time in kernel context per CPU:
++#
++.B alisyslatency
++.TP
++Print 1 second summaries, 10 times:
++#
++.B alisyslatency 1 10
++.TP
++Print 1s summaries, nanoseconds, and timestamps:
++#
++.B alisyslatency \-NT 1
++.TP
++Show time in kernel context on CPU 0 only:
++#
++.B alisyslatency \-c 0
++.TP
++Show time in kernel context for PID 25 only:
++#
++.B alisyslatency \-p 25
++.SH FIELDS
++.TP
++CPU
++CPU index.
++.TP
++PID
++PID of sampled process.
++.TP
++Count
++Number of samples on a specific CPU or PID.
++.TP
++Total_
++Total time spent in kernel context by a specific PID.
++.TP
++MAX_
++Maximum time spent in kernel context by a specific PID.
++.SH OVERHEAD
++This instruments every user_exit/user_enter tracepoint and performs the summaries in-kernel.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Jeffle Xu
+diff --git a/man/man8/aliworkslower.8 b/man/man8/aliworkslower.8
+new file mode 100644
+index 0000000..4dac8fb
+--- /dev/null
++++ b/man/man8/aliworkslower.8
+@@ -0,0 +1,59 @@
++.TH aliworkslower 8 "2019-7-25" "USER COMMANDS"
++.SH NAME
++aliworkslower \- Trace high workqueue work scheduling latency.
++.SH SYNOPSIS
++.B aliworkslower [\-h] [threshold]
++.SH DESCRIPTION
++This program traces workqueue works with high scheduling latency.
++
++Since this uses BPF, only the root user can use this tool.
++.SH REQUIREMENTS
++CONFIG_BPF and bcc.
++.SH OPTIONS
++.TP
++\-h
++Print usage message.
++.TP
++[threshold]
++Set threshold of Q2S latency, in microseconds.
++.SH EXAMPLES
++.TP
++Trace work sched latency higher than 10000 us (default):
++#
++.B aliworkslower
++.TP
++Trace work sched latency higher than 1000 us:
++#
++.B aliworkslower 1000
++.SH FIELDS
++.TP
++Worker Thread
++PID of worker thread.
++.TP
++Work
++Work callback.
++.TP
++Total Q2S latency
++Latency between workqueue_queue_work (Q) and workqueue_execute_start (S), in us.
++.TP
++Q2A latency
++Latency between workqueue_queue_work (Q) and workqueue_activate_work (A), in us.
++.TP
++A2S latency
++Latency between workqueue_activate_work (A) and workqueue_execute_start (S), in us.
++.SH OVERHEAD
++This instruments every workqueue_queue_work/workqueue_activate_work/
++workqueue_execute_start tracepoint and performs the summaries in-kernel.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a companion _examples.txt file containing
++example usage, output, and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development.
++.SH AUTHOR
++Jeffle Xu
+diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py
+index 9222e37..db51793 100755
+--- a/tests/python/test_tools_smoke.py
++++ b/tests/python/test_tools_smoke.py
+@@ -421,5 +421,53 @@ class SmokeTests(TestCase):
+         # Fails to attach the probe if zfs is not installed.
+         pass
+
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alirunqinfo(self):
++        self.run_with_duration("alirunqinfo.py 1 1")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alihardirqs(self):
++        self.run_with_duration("alihardirqs.py 1 1")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alisoftirqs(self):
++        self.run_with_duration("alisoftirqs.py 1 1")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alisyscount(self):
++        self.run_with_duration("alisyscount.py 1 1")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alisysdelay(self):
++        self.run_with_int("alisysdelay.py", timeout=10, kill_timeout=15)
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alisyslatency(self):
++        self.run_with_duration("alisyslatency.py 1 1")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_aliworkslower(self):
++        self.run_with_int("aliworkslower.py", timeout=10, kill_timeout=15)
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_aliexitsnoop(self):
++        self.run_with_int("aliexitsnoop.py")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alimutexsnoop(self):
++        self.run_with_int("alimutexsnoop.py")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alibiolatency(self):
++        self.run_with_int("alibiolatency.py")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_alijbd2stat(self):
++        self.run_with_int("alijbd2stat.py")
++
++    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
++    def test_aliext4writeslower(self):
++        self.run_with_int("aliext4writeslower.py")
++
+ if __name__ == "__main__":
+     main()
+diff --git a/tools/alibiolatency.py b/tools/alibiolatency.py
+new file mode 100755
+index 0000000..07f98d3
+--- /dev/null
++++ b/tools/alibiolatency.py
+@@ -0,0 +1,532 @@
++#!/usr/bin/python
++# 
@lint-avoid-python-3-compatibility-imports ++# ++# alibiolatency Summarize block device I/O latency. ++# For Linux, uses BCC, eBPF. ++# ++# USAGE: alibiolatency [-h] [-i interval] [-d device] [-t avg_req_time] [-T req_time] [-r] ++# ++# Copyright (c) 2019-2021 Alibaba Group. ++# Licensed under the Apache License, Version 2.0 (the "License") ++# ++# 2019/07/01 Xiaoguang Wang Created this. ++ ++from __future__ import print_function ++from bcc import BPF ++import ctypes as ct ++import argparse ++import time ++ ++# arguments ++examples = """examples: ++ ./alibiolatency # summarize block I/O latency(default display interval is 2s) ++ ./alibiolatency -d sda3 # inspect specified device /dev/sda3 ++ ./alibiolatency -i 2 # specify display interval, 2s ++ ./alibiolatency -t 10 # display only when average request process time is greater than 10ms ++ ./alibiolatency -T 20 # dump request life cycle when single request process time is greater than 20ms ++ ./alibiolatency -r # dump every io request life cycle ++""" ++parser = argparse.ArgumentParser( ++ description="Summarize block device I/O latency", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=examples) ++parser.add_argument("-d", "--device", help="inspect specified device") ++parser.add_argument("-i", "--dis_interval", nargs="?", default=2, ++ help="specify display interval") ++parser.add_argument("-t", "--avg_threshold_time", nargs="?", default=0, ++ help="display only when average request process time is greater than this value") ++parser.add_argument("-T", "--threshold_time", nargs="?", default=999999999, ++ help="dump request life cycle when single request process time is greater than this value") ++parser.add_argument("-r", "--dump_raw", action="store_true", help="dump every io request life cycle") ++parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) ++args = parser.parse_args() ++threshold_time = int(args.threshold_time) ++avg_threshold_time = int(args.avg_threshold_time) ++dis_interval = int(args.dis_interval) ++debug = 0 ++ ++# define BPF program ++bpf_text = """ ++#include ++#include ++ ++struct req_stat { ++ dev_t dev; ++ u64 queue_us; ++ u64 getrq_us; ++ u64 insert_us; ++ u64 issue_us; ++ u64 completion_us; ++ u64 sector; ++ u32 num_sectors; ++ char rwbs[8]; ++}; ++ ++typedef struct disk_key { ++ dev_t dev; ++ u64 sector; ++} disk_key_t; ++ ++BPF_HASH(io_requests, disk_key_t, struct req_stat); ++BPF_PERF_OUTPUT(req_stat); ++ ++ ++TRACEPOINT_PROBE(block, block_bio_queue) ++{ ++ disk_key_t key = {}; ++ struct req_stat s = {}; ++ dev_t dev = args->dev; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ key.dev = args->dev; ++ key.sector = args->sector; ++ s.dev = key.dev; ++ s.queue_us = bpf_ktime_get_ns() / 1000; ++ bpf_probe_read(s.rwbs, 8, args->rwbs); ++ io_requests.update(&key, &s); ++ return 0; ++} ++ ++TRACEPOINT_PROBE(block, block_getrq) ++{ ++ struct req_stat *r; ++ disk_key_t key = {}; ++ dev_t dev = args->dev; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ key.dev = args->dev; ++ key.sector = args->sector; ++ r = io_requests.lookup(&key); ++ if (r == NULL) ++ return 0; ++ ++ r->getrq_us = bpf_ktime_get_ns() / 1000; ++ return 0; ++} ++ ++TRACEPOINT_PROBE(block, block_bio_backmerge) ++{ ++ struct req_stat *r; ++ disk_key_t key = {}; ++ dev_t dev = args->dev; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ key.dev = args->dev; ++ key.sector = args->sector; ++ io_requests.delete(&key); ++ ++ return 0; ++} ++ ++TRACEPOINT_PROBE(block, block_bio_frontmerge) ++{ ++ struct req_stat *r; ++ struct req_stat *r2; ++ 
disk_key_t key = {}; ++ disk_key_t key2 = {}; ++ dev_t dev = args->dev; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ key.dev = args->dev; ++ key.sector = args->sector + args->nr_sector; ++ r = io_requests.lookup(&key); ++ if (r == NULL) ++ return 0; ++ ++ key2.dev = args->dev; ++ key2.sector = args->sector; ++ r2 = io_requests.lookup(&key2); ++ if (r2 == NULL) { ++ io_requests.delete(&key); ++ return 0; ++ } ++ ++ // keep old value. ++ r2->queue_us = r->queue_us; ++ r2->getrq_us = r->getrq_us; ++ r2->insert_us = r->insert_us; ++ r2->issue_us = r->issue_us; ++ io_requests.delete(&key); ++ ++ return 0; ++} ++ ++TRACEPOINT_PROBE(block, block_rq_insert) ++{ ++ struct req_stat *r; ++ disk_key_t key = {}; ++ dev_t dev = args->dev; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ key.dev = args->dev; ++ key.sector = args->sector; ++ r = io_requests.lookup(&key); ++ if (r == NULL) ++ return 0; ++ ++ r->insert_us = bpf_ktime_get_ns() / 1000; ++ return 0; ++} ++ ++TRACEPOINT_PROBE(block, block_rq_issue) ++{ ++ struct req_stat *r; ++ disk_key_t key = {}; ++ dev_t dev = args->dev; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ key.dev = args->dev; ++ key.sector = args->sector; ++ r = io_requests.lookup(&key); ++ if (r == NULL) ++ return 0; ++ ++ r->issue_us = bpf_ktime_get_ns() / 1000; ++ r->num_sectors = args->nr_sector; ++ return 0; ++} ++ ++TRACEPOINT_PROBE(block, block_rq_complete) ++{ ++ struct req_stat *r; ++ struct req_stat val = {}; ++ disk_key_t key = {}; ++ dev_t dev = args->dev; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ key.dev = args->dev; ++ key.sector = args->sector; ++ r = io_requests.lookup(&key); ++ if (r == NULL) ++ return 0; ++ ++ /* could be happen. ++ if (r->num_sectors != args->nr_sector) ++ bpf_trace_printk("here"); ++ */ ++ r->completion_us = bpf_ktime_get_ns() / 1000; ++ /* FIXME: should we use r? */ ++ //val = *r; ++ val.dev = r->dev; ++ val.queue_us = r->queue_us; ++ val.getrq_us = r->getrq_us; ++ val.insert_us = r->insert_us; ++ val.issue_us = r->issue_us; ++ val.completion_us = r->completion_us; ++ val.sector = args->sector; ++ val.num_sectors = args->nr_sector; ++ memcpy(val.rwbs, r->rwbs, 8); ++ ++ io_requests.delete(&key); ++ ++ req_stat.perf_submit(args, &val, sizeof(struct req_stat)); ++ return 0; ++} ++ ++int trace_elv_merge_requests(struct pt_regs *ctx, struct request_queue *q, ++ struct request *rq, struct request *next) ++{ ++ disk_key_t key = {}; ++ dev_t dev = (next->rq_disk)->part0.__dev.devt; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ key.dev = dev; ++ key.sector = next->__sector; ++ io_requests.delete(&key); ++ ++ return 0; ++} ++ ++""" ++ ++dev_name={} ++ ++def init_dev_name(): ++ global dev_name ++ ++ f = open("/proc/partitions") ++ for line in f.readlines(): ++ line = line.strip() ++ if not len(line): ++ continue; ++ ++ str = line.split() ++ if str[0].isalpha(): ++ continue ++ dev_num = (int(str[0]) << 20) + int(str[1]) ++ dev_name[dev_num] = str[3] ++ f.close() ++ ++init_dev_name() ++name2devid={v:k for k,v in dev_name.items()} ++ ++if args.device: ++ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) ++else: ++ bpf_text = bpf_text.replace('FILTER_DEV', '0') ++ ++if args.dump_raw: ++ dump_raw = 1 ++else: ++ dump_raw = 0 ++ ++# code substitutions ++if debug or args.ebpf: ++ print(bpf_text) ++ if args.ebpf: ++ exit() ++ ++# load BPF program ++b = BPF(text=bpf_text) ++b.attach_kprobe(event="elv_merge_requests", fn_name="trace_elv_merge_requests") ++ ++print("Tracing block device I/O... 
Hit Ctrl-C to end.")
++
++class Data(ct.Structure):
++    _fields_ = [
++        ("dev", ct.c_uint),
++        ("queue_us", ct.c_ulonglong),
++        ("getrq_us", ct.c_ulonglong),
++        ("insert_us", ct.c_ulonglong),
++        ("issue_us", ct.c_ulonglong),
++        ("completion_us", ct.c_ulonglong),
++        ("sector", ct.c_ulonglong),
++        ("num_sectors", ct.c_uint),
++        ("rwbs", ct.c_char * 8)
++    ]
++
++io_stats={}
++last_dis_time=0
++
++class req_stats:
++    def __init__(self, dev):
++        self.dev=dev
++        self.sectors=[0, 0, 0]
++        self.num_reqs=[0, 0, 0]
++        self.lat=[0, 0, 0]
++        self.lat_max=[0, 0, 0]
++        self.lat_min=[99999999, 99999999, 99999999]
++        self.i2d_lat=[0, 0, 0]
++        self.i2d_lat_max=[0, 0, 0]
++        self.i2d_lat_min=[99999999, 99999999, 99999999]
++        self.d2c_lat=[0, 0, 0]
++        self.d2c_lat_max=[0, 0, 0]
++        self.d2c_lat_min=[99999999, 99999999, 99999999]
++
++    def update_req_stats(self, io_type, num_sectors, lat, i2d_lat, d2c_lat):
++        self.num_reqs[io_type] += 1
++        self.sectors[io_type] += num_sectors
++        self.lat[io_type] += lat
++        self.i2d_lat[io_type] += i2d_lat
++        self.d2c_lat[io_type] += d2c_lat
++
++        if lat > self.lat_max[io_type]:
++            self.lat_max[io_type] = lat
++
++        if lat < self.lat_min[io_type]:
++            self.lat_min[io_type] = lat
++
++        if i2d_lat > self.i2d_lat_max[io_type]:
++            self.i2d_lat_max[io_type] = i2d_lat
++
++        if i2d_lat < self.i2d_lat_min[io_type]:
++            self.i2d_lat_min[io_type] = i2d_lat
++
++        if d2c_lat > self.d2c_lat_max[io_type]:
++            self.d2c_lat_max[io_type] = d2c_lat
++
++        if d2c_lat < self.d2c_lat_min[io_type]:
++            self.d2c_lat_min[io_type] = d2c_lat
++
++    def display_req_stats(self, time_interval):
++        global avg_threshold_time
++
++        ti = time_interval / 1000.0 / 1000.0
++
++        print("dev: %s" % dev_name[self.dev])
++        for io_type in (0, 1, 2):
++            if self.num_reqs[io_type] == 0:
++                continue
++            total_reqs = self.num_reqs[io_type]
++
++            iops = int(total_reqs / ti)
++            avg_lat = self.lat[io_type] / 1000.0 / total_reqs
++            if avg_lat < avg_threshold_time:
++                continue
++
++            speed = self.sectors[io_type] * 512 / 1024 / 1024 / ti
++            avg_req_sz = self.sectors[io_type] * 512 / 1024 / total_reqs
++
++            i2d_avg = self.i2d_lat[io_type] / 1000.0 / total_reqs
++            i2d_percent = int(i2d_avg / avg_lat * 100)
++
++            d2c_avg = self.d2c_lat[io_type] / 1000.0 / total_reqs
++            d2c_percent = int(d2c_avg / avg_lat * 100)
++
++            if io_type == 0:
++                io_type_str="READ"
++            elif io_type == 1:
++                io_type_str="WRITE"
++            else:
++                io_type_str="DISCARD"
++
++            print("    %s iops:%d avg:%.2f min:%.2f max:%.2f speed:%.2fMB/s avgrq-sz:%dKB" %
++                (io_type_str, iops, avg_lat, self.lat_min[io_type] / 1000.0,
++                self.lat_max[io_type] / 1000.0, speed, avg_req_sz))
++
++            print("        [i2d_avg: %.2f percent:%d min:%.2f max:%.2f" % (i2d_avg,
++                i2d_percent, self.i2d_lat_min[io_type] / 1000.0,
++                self.i2d_lat_max[io_type] / 1000.0))
++
++            print("        [d2c_avg: %.2f percent:%d min:%.2f max:%.2f" % (d2c_avg,
++                d2c_percent, self.d2c_lat_min[io_type] / 1000.0,
++                self.d2c_lat_max[io_type] / 1000.0))
++
++
++def get_io_type(rwbs):
++    # please see logic in kernel func blk_fill_rwbs()
++    if rwbs[0] == 'R' or rwbs[0:2] == "FR":
++        return 0
++    elif rwbs[0] == 'W' or rwbs[0:2] == "FW":
++        return 1
++    return 2
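++
++# A few illustrative mappings, assuming rwbs strings as produced by the
++# kernel's blk_fill_rwbs() (e.g. "R", "RA", "WS", "FWFS", "D"):
++#     get_io_type("RA")   -> 0  (read-ahead counts as READ)
++#     get_io_type("WS")   -> 1  (sync write counts as WRITE)
++#     get_io_type("D")    -> 2  (everything else, e.g. DISCARD)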
++
++def is_exception_req(event):
++    # we count this as an abnormal request...
++    # Currently I still haven't figured out in which case getrq_us
++    # will be zero...
++    if event.getrq_us == 0:
++        print("dev:%s act:%s sector:%lu bytes:%u exception req Q:%lu G:%lu I:%lu D:%lu C:%lu" % (
++            dev_name[event.dev], event.rwbs, event.sector, event.num_sectors, event.queue_us,
++            event.getrq_us, event.insert_us, event.issue_us, event.completion_us))
++
++def dump_raw_trace_event(dev, act, ts, event):
++    print("%s %s %lu sector:%lu bytes:%u act:%s" % (dev, act, ts / 1000,
++        event.sector, event.num_sectors, event.rwbs))
++
++def dump_timeout_req(event):
++    dev = dev_name[event.dev]
++    localtime = time.asctime(time.localtime(time.time()))
++
++    print("%s" % localtime)
++    print("timeouted request sector:%lu bytes:%u req time:%lu" % (event.sector,
++        event.num_sectors, (event.completion_us - event.queue_us) / 1000))
++    dump_raw_trace_event(dev, "Q", event.queue_us, event)
++    dump_raw_trace_event(dev, "G", event.getrq_us, event)
++    dump_raw_trace_event(dev, "I", event.insert_us, event)
++    dump_raw_trace_event(dev, "D", event.issue_us, event)
++    dump_raw_trace_event(dev, "C", event.completion_us, event)
++
++def dump_req_life_cycle(event):
++    dev = dev_name[event.dev]
++    localtime = time.asctime(time.localtime(time.time()))
++
++    print("%s" % localtime)
++    print("raw request event sector:%lu bytes:%u req time:%lu" % (event.sector,
++        event.num_sectors, (event.completion_us - event.queue_us) / 1000))
++    dump_raw_trace_event(dev, "Q", event.queue_us, event)
++    dump_raw_trace_event(dev, "G", event.getrq_us, event)
++    dump_raw_trace_event(dev, "I", event.insert_us, event)
++    dump_raw_trace_event(dev, "D", event.issue_us, event)
++    dump_raw_trace_event(dev, "C", event.completion_us, event)
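++
++# The helper below is an illustrative sketch only (it is not called by the
++# tool): it shows how a request's life cycle timestamps decompose into the
++# classic blktrace-style stage latencies, all in microseconds. A zero
++# timestamp means that stage was never recorded for this request.
++def stage_latencies(event):
++    return {
++        "Q2G": event.getrq_us - event.queue_us if event.getrq_us else 0,
++        "G2I": event.insert_us - event.getrq_us if event.insert_us else 0,
++        "I2D": event.issue_us - event.insert_us if event.issue_us else 0,
++        "D2C": event.completion_us - event.issue_us if event.issue_us else 0,
++    }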
++
++# process event
++def print_event(cpu, data, size):
++    global io_stats
++    global last_dis_time
++
++    event = ct.cast(data, ct.POINTER(Data)).contents
++
++    is_exception_req(event)
++
++    if dump_raw != 0:
++        dump_req_life_cycle(event)
++
++    lat = event.completion_us - event.queue_us
++
++    if event.issue_us != 0:
++        d2c_lat = event.completion_us - event.issue_us
++    elif event.insert_us != 0:
++        d2c_lat = event.completion_us - event.insert_us
++    elif event.getrq_us != 0:
++        d2c_lat = event.completion_us - event.getrq_us
++    else:
++        d2c_lat = lat
++
++    # Note: this could happen...
++    # In kernel 4.9, for example, scsi_request_fn calls blk_peek_request()
++    # to fetch a request, but if the scsi driver later could not process this
++    # request (see the not_ready label in scsi_request_fn), it will call
++    # blk_requeue_request() to put this request back into the block elevator;
++    # now insert_us will be greater than issue_us.
++    #
++    # Later scsi_request_fn will pick this request again, but it still may
++    # not be able to handle this request, see the code in scsi_request_fn:
++    #     if (!scsi_dev_queue_ready(q, sdev))
++    #         break;
++    # then this request will still be in the block elevator, but with
++    # RQF_STARTED marked.
++    #
++    # Finally scsi_request_fn calls blk_peek_request() to process this
++    # request; because RQF_STARTED is set, this time blk_peek_request()
++    # won't call trace_block_rq_issue(), so issue_us also won't be updated.
++    # Now insert_us is greater than issue_us.
++    if event.issue_us < event.insert_us:
++        print("insert>issue %s %s sector:%lu bytes:%u I:%lu D:%lu" % (dev_name[event.dev],
++            event.rwbs, event.sector, event.num_sectors, event.insert_us,
++            event.issue_us))
++
++    if event.insert_us != 0 and event.issue_us > event.insert_us:
++        i2d_lat = event.issue_us - event.insert_us
++    else:
++        i2d_lat = 0
++    io_type = get_io_type(event.rwbs)
++
++    if last_dis_time == 0:
++        last_dis_time = event.completion_us
++
++    if event.dev not in io_stats:
++        io_stats[event.dev] = req_stats(event.dev)
++
++    if lat > threshold_time * 1000:
++        dump_timeout_req(event)
++
++    io_stats[event.dev].update_req_stats(io_type, event.num_sectors, lat, i2d_lat, d2c_lat)
++    if event.completion_us > last_dis_time and (event.completion_us - last_dis_time) > dis_interval * 1000 * 1000:
++        time_interval = event.completion_us - last_dis_time
++        localtime = time.asctime(time.localtime(time.time()))
++
++        print("%s" % localtime)
++        for k in sorted(io_stats.keys()):
++            io_stats[k].display_req_stats(time_interval)
++        last_dis_time = event.completion_us
++
++        print("\n")
++        io_stats.clear()
++
++
++b["req_stat"].open_perf_buffer(print_event, page_cnt=64)
++while 1:
++    try:
++        b.perf_buffer_poll()
++
++    except KeyboardInterrupt:
++        exit()
+diff --git a/tools/alibiolatency_example.txt b/tools/alibiolatency_example.txt
+new file mode 100644
+index 0000000..70afc6d
+--- /dev/null
++++ b/tools/alibiolatency_example.txt
+@@ -0,0 +1,119 @@
++Demonstrations of alibiolatency.
++
++alibiolatency traces every I/O request's life cycle and filters and summarizes
++the collected data. With this tool, we can clearly see how much each stage of
++an I/O request contributes to its total latency; for example, if the D2C time
++(see blktrace) of I/O requests takes the majority of the total request
++processing time, the disk bottleneck has most likely been reached.
++
++Usage:
++alibiolatency [-h] [-d DEVICE] [-i [DIS_INTERVAL]] [-t [AVG_THRESHOLD_TIME]] [-T [THRESHOLD_TIME]] [-r]
++
++Summarize block device I/O latency
++
++optional arguments:
++  -h, --help            show this help message and exit
++  -d DEVICE, --device DEVICE
++                        inspect specified device
++  -i [DIS_INTERVAL], --dis_interval [DIS_INTERVAL]
++                        specify display interval
++  -t [AVG_THRESHOLD_TIME], --avg_threshold_time [AVG_THRESHOLD_TIME]
++                        display only when average request process time is
++                        greater than this value
++  -T [THRESHOLD_TIME], --threshold_time [THRESHOLD_TIME]
++                        dump request life cycle when single request process
++                        time is greater than this value
++  -r, --dump_raw        dump every io request life cycle
++
++examples:
++    ./alibiolatency           # summarize block I/O latency (default display interval is 2s)
++    ./alibiolatency -d sda3   # inspect specified device /dev/sda3
++    ./alibiolatency -i 2      # specify display interval, 2s
++    ./alibiolatency -t 10     # display only when average request process time is greater than 10ms
++    ./alibiolatency -T 20     # dump request life cycle when single request process time is greater than 20ms
++    ./alibiolatency -r        # dump every io request life cycle
++
++
++Output example:
++
++Example 1:
++# Summarize block device I/O latency periodically.
++python alibiolatency.py ++ ++Wed Aug 28 19:13:00 2019 ++dev: sda ++ READ iops:32 avg:15.36 min:0.02 max:143.51 speed:0.00MB/s avgrq-sz:4KB ++ [i2d_avg: 0.00 percent:0 min:0.00 max:0.01 ++ [d2c_avg: 15.26 percent:99 min:0.02 max:143.51 ++ WRITE iops:2 avg:0.06 min:0.04 max:0.09 speed:0.00MB/s avgrq-sz:24KB ++ [i2d_avg: 0.00 percent:2 min:0.00 max:0.00 ++ [d2c_avg: 0.05 percent:76 min:0.03 max:0.06 ++ ++ ++Wed Aug 28 19:13:03 2019 ++dev: sda ++ READ iops:62 avg:2.03 min:0.03 max:16.08 speed:0.68MB/s avgrq-sz:14KB ++ [i2d_avg: 0.00 percent:0 min:0.00 max:0.00 ++ [d2c_avg: 1.99 percent:97 min:0.03 max:16.07 ++ WRITE iops:6 avg:0.06 min:0.03 max:0.12 speed:0.00MB/s avgrq-sz:10KB ++ [i2d_avg: 0.00 percent:5 min:0.00 max:0.01 ++ [d2c_avg: 0.04 percent:76 min:0.03 max:0.07 ++ ++Example 2: ++# Summarize block device I/O latency periodically, and will print io request's ++# complete life cycle when this io request's process time is greater than 20ms. ++python alibiolatency.py -T 20 # Summarize block device I/O latency periodically. ++ ++timeouted request sector:269555712 bytes:1024 req time:379 ++sda Q 3102021522 sector:269555712 bytes:1024 act:W ++sda G 3102021522 sector:269555712 bytes:1024 act:W ++sda I 3102021522 sector:269555712 bytes:1024 act:W ++sda D 3102021522 sector:269555712 bytes:1024 act:W ++sda C 3102021902 sector:269555712 bytes:1024 act:W ++Wed Aug 28 19:28:44 2019 ++timeouted request sector:269556736 bytes:1024 req time:381 ++sda Q 3102021522 sector:269556736 bytes:1024 act:W ++sda G 3102021522 sector:269556736 bytes:1024 act:W ++sda I 3102021522 sector:269556736 bytes:1024 act:W ++sda D 3102021522 sector:269556736 bytes:1024 act:W ++sda C 3102021904 sector:269556736 bytes:1024 act:W ++Wed Aug 28 19:28:45 2019 ++dev: sda ++ READ iops:0 avg:0.07 min:0.07 max:0.07 speed:0.00MB/s avgrq-sz:4KB ++ [i2d_avg: 0.01 percent:17 min:0.01 max:0.01 ++ [d2c_avg: 0.05 percent:73 min:0.05 max:0.05 ++ WRITE iops:173 avg:339.79 min:0.04 max:501.72 speed:69.09MB/s avgrq-sz:408KB ++ [i2d_avg: 0.02 percent:0 min:0.00 max:0.06 ++ [d2c_avg: 339.16 percent:99 min:0.04 max:501.67 ++ ++Example 3: ++# Summarize block device I/O latency periodically, and will print every io ++# request's complete life cycle. 
++python alibiolatency.py -r ++ ++Wed Aug 28 19:36:40 2019 ++raw request event sector:52903080 bytes:8 req time:0 ++sda Q 3102498371 sector:52903080 bytes:8 act:WS ++sda G 3102498371 sector:52903080 bytes:8 act:WS ++sda I 3102498371 sector:52903080 bytes:8 act:WS ++sda D 3102498371 sector:52903080 bytes:8 act:WS ++sda C 3102498371 sector:52903080 bytes:8 act:WS ++Wed Aug 28 19:36:40 2019 ++raw request event sector:2157826760 bytes:8 req time:0 ++sda Q 3102498785 sector:2157826760 bytes:8 act:W ++sda G 3102498785 sector:2157826760 bytes:8 act:W ++sda I 3102498785 sector:2157826760 bytes:8 act:W ++sda D 3102498785 sector:2157826760 bytes:8 act:W ++sda C 3102498785 sector:2157826760 bytes:8 act:W ++Wed Aug 28 19:36:42 2019 ++raw request event sector:208390056 bytes:8 req time:0 ++sda Q 3102500593 sector:208390056 bytes:8 act:W ++sda G 3102500593 sector:208390056 bytes:8 act:W ++sda I 3102500593 sector:208390056 bytes:8 act:W ++sda D 3102500593 sector:208390056 bytes:8 act:W ++sda C 3102500593 sector:208390056 bytes:8 act:W ++Wed Aug 28 19:36:42 2019 ++dev: sda ++ WRITE iops:42 avg:0.13 min:0.04 max:0.24 speed:0.00MB/s avgrq-sz:5KB ++ [i2d_avg: 0.01 percent:8 min:0.00 max:0.03 ++ [d2c_avg: 0.06 percent:48 min:0.03 max:0.10 +diff --git a/tools/aliexitsnoop.py b/tools/aliexitsnoop.py +new file mode 100755 +index 0000000..4f0b1a1 +--- /dev/null ++++ b/tools/aliexitsnoop.py +@@ -0,0 +1,302 @@ ++#!/usr/bin/python ++# @lint-avoid-python-3-compatibility-imports ++from __future__ import print_function ++ ++import argparse ++import ctypes as ct ++import os ++import platform ++import re ++import signal ++import sys ++ ++from bcc import BPF ++from bcc.utils import printb ++from datetime import datetime ++from time import strftime ++ ++# ++# aliexitsnoop Trace all process termination (exit, fatal signal) ++# For Linux, uses BCC, eBPF. Embedded C. ++# ++# USAGE: aliexitsnoop [-h] [-x] [-t] [--utc] [--label[=LABEL]] [-p PID] ++# ++_examples = """examples: ++ aliexitsnoop # trace all process termination ++ aliexitsnoop -x # trace only fails, exclude exit(0) ++ aliexitsnoop -t # include timestamps (local time) ++ aliexitsnoop --utc # include timestamps (UTC) ++ aliexitsnoop -p 181 # only trace PID 181 ++ aliexitsnoop --label=exit # label each output line with 'exit' ++""" ++""" ++ Exit status (from ): ++ ++ 0 EX_OK Success ++ 2 argparse error ++ 70 EX_SOFTWARE syntax error detected by compiler, or ++ verifier error from kernel ++ 77 EX_NOPERM Need sudo (CAP_SYS_ADMIN) for BPF() system call ++ ++ More information about this script is in bcc/tools/aliexitsnoop_example.txt ++ ++ Copyright 2016 Netflix, Inc. ++ Copyright 2019 Instana, Inc. ++ Copyright 2019 Alibaba, Inc. 
++ Licensed under the Apache License, Version 2.0 (the "License") ++ ++ 07-Feb-2016 Brendan Gregg (Netflix) Created execsnoop ++ 04-May-2019 Arturo Martin-de-Nicolas (Instana) Created exitsnoop ++ 13-May-2019 Jeroen Soeters (Instana) Refactor to import as module ++ 16-Jul-2019 Alvin Zheng (Alibaba) Created aliexitsnoop ++""" ++ ++def _getParser(): ++ parser = argparse.ArgumentParser( ++ description="Trace all process termination (exit, fatal signal)", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=_examples) ++ a = parser.add_argument ++ a("-t", "--timestamp", action="store_true", help="include timestamp (local time default)") ++ a("--utc", action="store_true", help="include timestamp in UTC (-t implied)") ++ a("-p", "--pid", help="trace this PID only") ++ a("--label", help="label each line") ++ a("-x", "--failed", action="store_true", help="trace only fails, exclude exit(0)") ++ # print the embedded C program and exit, for debugging ++ a("--ebpf", action="store_true", help=argparse.SUPPRESS) ++ # RHEL 7.6 keeps task->start_time as struct timespec, convert to u64 nanoseconds ++ a("--timespec", action="store_true", help=argparse.SUPPRESS) ++ return parser.parse_args ++ ++ ++class Global(): ++ parse_args = _getParser() ++ args = None ++ argv = None ++ bpf = None ++ SIGNUM_TO_SIGNAME = dict((v, re.sub("^SIG", "", k)) ++ for k, v in signal.__dict__.items() if re.match("^SIG[A-Z]+$", k)) ++ ++ ++class Data(ct.Structure): ++ """Event data matching struct data_t in _embedded_c().""" ++ _TASK_COMM_LEN = 16 # linux/sched.h ++ _pack_ = 1 ++ _fields_ = [ ++ ("start_time", ct.c_ulonglong), # task->start_time, see --timespec arg ++ ("exit_time", ct.c_ulonglong), # bpf_ktime_get_ns() ++ ("pid_tgid", ct.c_ulonglong), ++ ("pid", ct.c_uint), # task->tgid, thread group id == sys_getpid() ++ ("tid", ct.c_uint), # task->pid, thread id == sys_gettid() ++ ("ppid", ct.c_uint), # task->parent->tgid, notified of exit ++ ("exit_code", ct.c_int), ++ ("sig_info", ct.c_uint), ++ ("k_stack_id", ct.c_int), ++ ("u_stack_id", ct.c_int), ++ ("task", ct.c_char * _TASK_COMM_LEN) ++ ] ++ ++def _embedded_c(args): ++ """Generate C program for sched_process_exit tracepoint in kernel/exit.c.""" ++ c = """ ++ EBPF_COMMENT ++ #include ++ BPF_STATIC_ASSERT_DEF ++ BPF_STACK_TRACE(stack_traces,16384); ++ struct data_t { ++ u64 start_time; ++ u64 exit_time; ++ u64 pid_tgid; ++ u32 pid; ++ u32 tid; ++ u32 ppid; ++ int exit_code; ++ u32 sig_info; ++ int k_stack_id; ++ int u_stack_id; ++ char task[TASK_COMM_LEN]; ++ } __attribute__((packed)); ++ ++ BPF_STATIC_ASSERT(sizeof(struct data_t) == CTYPES_SIZEOF_DATA); ++ BPF_PERF_OUTPUT(events); ++ ++ int kprobe_do_exit(struct pt_regs * ctx, long exit_code) ++ { ++ struct task_struct *task = (typeof(task))bpf_get_current_task(); ++ if (FILTER_PID || FILTER_EXIT_CODE) { return 0; } ++ ++ struct data_t data = { ++ .start_time = PROCESS_START_TIME_NS, ++ .exit_time = bpf_ktime_get_ns(), ++ .pid_tgid=bpf_get_current_pid_tgid(), ++ .pid = task->tgid, ++ .tid = task->pid, ++ .ppid = task->parent->tgid, ++ .exit_code = task->exit_code >> 8, ++ .sig_info = task->exit_code & 0xFF, ++ }; ++ bpf_get_current_comm(&data.task, sizeof(data.task)); ++ data.k_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID); ++ data.u_stack_id=stack_traces.get_stackid(ctx,BPF_F_USER_STACK|BPF_F_REUSE_STACKID); ++ events.perf_submit(ctx, &data, sizeof(data)); ++ return 0; ++ } ++ """ ++ # TODO: this macro belongs in bcc/src/cc/export/helpers.h ++ bpf_static_assert_def = r""" ++ #ifndef 
BPF_STATIC_ASSERT ++ #define BPF_STATIC_ASSERT(condition) __attribute__((unused)) \ ++ extern int bpf_static_assert[(condition) ? 1 : -1] ++ #endif ++ """ ++ code_substitutions = [ ++ ('EBPF_COMMENT', '' if not Global.args.ebpf else _ebpf_comment()), ++ ("BPF_STATIC_ASSERT_DEF", bpf_static_assert_def), ++ ("CTYPES_SIZEOF_DATA", str(ct.sizeof(Data))), ++ ('FILTER_PID', '0' if not Global.args.pid else "task->tgid != %s" % Global.args.pid), ++ ('FILTER_EXIT_CODE', '0' if not Global.args.failed else 'task->exit_code == 0'), ++ ('PROCESS_START_TIME_NS', 'task->start_time' if not Global.args.timespec else ++ '(task->start_time.tv_sec * 1000000000L) + task->start_time.tv_nsec'), ++ ] ++ for old, new in code_substitutions: ++ c = c.replace(old, new) ++ return c ++ ++def _ebpf_comment(): ++ """Return a C-style comment with information about the generated code.""" ++ comment = ('Created by %s at %s:\n\t%s' % ++ (sys.argv[0], strftime("%Y-%m-%d %H:%M:%S %Z"), _embedded_c.__doc__)) ++ args = str(vars(Global.args)).replace('{', '{\n\t').replace(', ', ',\n\t').replace('}', ',\n }\n\n') ++ return ("\n /*" + ("\n %s\n\n ARGV = %s\n\n ARGS = %s/" % ++ (comment, ' '.join(Global.argv), args)) ++ .replace('\n', '\n\t*').replace('\t', ' ')) ++ ++def _print_header(): ++ if Global.args.timestamp: ++ title = 'TIME-' + ('UTC' if Global.args.utc else strftime("%Z")) ++ print("%-13s" % title, end="") ++ if Global.args.label is not None: ++ print("%-6s" % "LABEL", end="") ++ print("%-16s %-6s %-6s %-6s %-7s %-10s" % ++ ("PCOMM", "PID", "PPID", "TID", "AGE(s)", "EXIT_CODE")) ++ ++def _print_event(cpu, data, size): # callback ++ """Print the exit event.""" ++ stack_traces = Global.bpf['stack_traces'] ++ e = ct.cast(data, ct.POINTER(Data)).contents ++ if Global.args.timestamp: ++ now = datetime.utcnow() if Global.args.utc else datetime.now() ++ print("%-13s" % (now.strftime("%H:%M:%S.%f")[:-3]), end="") ++ if Global.args.label is not None: ++ label = Global.args.label if len(Global.args.label) else 'exit' ++ print("%-6s" % label, end="") ++ age = (e.exit_time - e.start_time) / 1e9 ++ print("%-16s %-6d %-6d %-6d %-7.2f " % ++ (e.task.decode(), e.pid, e.ppid, e.tid, age), end="") ++ if e.sig_info == 0: ++ print("0" if e.exit_code == 0 else "code %d" % e.exit_code) ++ else: ++ sig = e.sig_info & 0x7F ++ if sig: ++ print("signal %d (%s)" % (sig, signum_to_signame(sig)), end="") ++ if e.sig_info & 0x80: ++ print(", core dumped ", end="") ++ print() ++ # print the kernel stack ++ if e.k_stack_id >= 0: ++ cur_stack = reversed(list(stack_traces.walk(e.k_stack_id))) ++ for addr in cur_stack: ++ printb(b"Kernel Stack: %-16x %s" % (addr, Global.bpf.ksym(addr))) ++ else: ++ print("No KERNEL STACK FOUND") ++ # print the user stack ++ if e.u_stack_id >= 0: ++ cur_stack = reversed(list(stack_traces.walk(e.u_stack_id))) ++ for addr in cur_stack: ++ printb(b"USER Stack: %-16x %s" % (addr, Global.bpf.sym(addr, e.pid_tgid))) ++ else: ++ print("No USER STACK FOUND") ++ print("=================================") ++ print() ++# ============================= ++# Module: These functions are available for import ++# ============================= ++def initialize(arg_list=sys.argv[1:]): ++ """Trace all process termination. 
++
++    arg_list - list of args, if omitted then uses command line args
++        arg_list is passed to argparse.ArgumentParser.parse_args()
++
++    For example, if arg_list = [ '-x', '-t' ]
++        args.failed == True
++        args.timestamp == True
++
++    Returns a tuple (return_code, result)
++        0 = Ok, result is the return value from BPF()
++        1 = args.ebpf is requested, result is the generated C code
++        os.EX_NOPERM: need CAP_SYS_ADMIN, result is error message
++        os.EX_SOFTWARE: internal software error, result is error message
++    """
++    Global.argv = arg_list
++    Global.args = Global.parse_args(arg_list)
++    if Global.args.utc and not Global.args.timestamp:
++        Global.args.timestamp = True
++    if not Global.args.ebpf and os.geteuid() != 0:
++        return (os.EX_NOPERM, "Need sudo (CAP_SYS_ADMIN) for BPF() system call")
++    if re.match('^3\.10\..*el7.*$', platform.release()):  # CentOS/Red Hat
++        Global.args.timespec = True
++    for _ in range(2):
++        c = _embedded_c(Global.args)
++        if Global.args.ebpf:
++            return (1, c)
++        try:
++            return (os.EX_OK, BPF(text=c))
++        except Exception as e:
++            error = format(e)
++            if (not Global.args.timespec and error.find('struct timespec') != -1
++                    and error.find('start_time') != -1):
++                print('This kernel keeps task->start_time in a struct timespec.\n' +
++                      'Retrying with --timespec')
++                Global.args.timespec = True
++                continue
++            return (os.EX_SOFTWARE, "BPF error: " + error)
++        except:
++            return (os.EX_SOFTWARE, "Unexpected error: {0}".format(sys.exc_info()[0]))
++
++def snoop(bpf, event_handler):
++    """Call event_handler for process termination events.
++
++    bpf - result returned by successful initialize()
++    event_handler - callback function to handle termination event
++    args.pid - Return after event_handler is called, only monitoring this pid
++    """
++    bpf["events"].open_perf_buffer(event_handler)
++    while True:
++        bpf.perf_buffer_poll()
++        if Global.args.pid:
++            return
++
++def signum_to_signame(signum):
++    """Return the name of the signal corresponding to signum."""
++    return Global.SIGNUM_TO_SIGNAME.get(signum, "unknown")
++
++# =============================
++# Script: invoked as a script
++# =============================
++def main():
++    try:
++        rc, buffer = initialize()
++        if rc:
++            print(buffer)
++            sys.exit(0 if Global.args.ebpf else rc)
++        Global.bpf = buffer
++        Global.bpf.attach_kprobe(event="do_exit", fn_name="kprobe_do_exit")
++        _print_header()
++        snoop(buffer, _print_event)
++    except KeyboardInterrupt:
++        print()
++        sys.exit()
++
++    return 0
++
++if __name__ == '__main__':
++    main()
+diff --git a/tools/aliexitsnoop_example.txt b/tools/aliexitsnoop_example.txt
+new file mode 100644
+index 0000000..4b60929
+--- /dev/null
++++ b/tools/aliexitsnoop_example.txt
+@@ -0,0 +1,78 @@
++Demonstrations of aliexitsnoop.
++
++This Linux tool is an enhanced version of exitsnoop. For more details about
++exitsnoop, see exitsnoop_example.txt. Compared to exitsnoop, aliexitsnoop not
++only prints the exit code, pid, ppid and lifetime of the exiting processes,
++but also their stacks. The demand stems from the fact that in production
++environments some processes exit unexpectedly, so we need to dump the stack
++of the target process to find out what caused it to exit.
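++
++Like exitsnoop, aliexitsnoop can also be imported as a module. A minimal
++sketch (assuming the script is importable as "aliexitsnoop"; it reuses the
++module's built-in event printer):
++
++    import os
++    import aliexitsnoop
++
++    rc, b = aliexitsnoop.initialize(['-x'])   # (status, BPF object or error)
++    if rc == os.EX_OK:
++        aliexitsnoop.Global.bpf = b           # _print_event reads stacks via Global.bpf
++        b.attach_kprobe(event="do_exit", fn_name="kprobe_do_exit")
++        aliexitsnoop.snoop(b, aliexitsnoop._print_event)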
++
++Usage:
++aliexitsnoop.py [-h] [-t] [--utc] [-p PID] [--label LABEL] [-x]
++
++Trace all process termination (exit, fatal signal)
++
++optional arguments:
++  -h, --help         show this help message and exit
++  -t, --timestamp    include timestamp (local time default)
++  --utc              include timestamp in UTC (-t implied)
++  -p PID, --pid PID  trace this PID only
++  --label LABEL      label each line
++  -x, --failed       trace only fails, exclude exit(0)
++
++examples:
++    aliexitsnoop                # trace all process termination
++    aliexitsnoop -x             # trace only fails, exclude exit(0)
++    aliexitsnoop -t             # include timestamps (local time)
++    aliexitsnoop --utc          # include timestamps (UTC)
++    aliexitsnoop -p 181         # only trace PID 181
++    aliexitsnoop --label=exit   # label each output line with 'exit'
++
++The following example shows example output when run in a loop that is
++interrupted with Ctrl-C from the terminal:
++
++grep 25099 25087 25099 0.00 0
++Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe
++Kernel Stack: ffffffffa500201b do_syscall_64
++Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group
++Kernel Stack: ffffffffa5083ad9 do_group_exit
++Kernel Stack: ffffffffa5082e11 do_exit
++USER Stack: c74864108b48 [unknown]
++USER Stack: 7fd29b10c660 [unknown]
++USER Stack: 7fd29b0611d9 [unknown]
++=================================
++
++id 25101 25100 25101 0.00 0
++Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe
++Kernel Stack: ffffffffa500201b do_syscall_64
++Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group
++Kernel Stack: ffffffffa5083ad9 do_group_exit
++Kernel Stack: ffffffffa5082e11 do_exit
++USER Stack: c74864108b48 [unknown]
++USER Stack: 7f932c07b660 [unknown]
++USER Stack: 7f932bfd01d9 [unknown]
++=================================
++
++bash 25100 25087 25100 0.00 0
++Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe
++Kernel Stack: ffffffffa500201b do_syscall_64
++Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group
++Kernel Stack: ffffffffa5083ad9 do_group_exit
++Kernel Stack: ffffffffa5082e11 do_exit
++USER Stack: c74864108b48 [unknown]
++USER Stack: 7f5318862660 [unknown]
++USER Stack: 7f53187b71d9 [unknown]
++=================================
++
++systemd 1 0 25102 0.00 0
++Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe
++Kernel Stack: ffffffffa500201b do_syscall_64
++Kernel Stack: ffffffffa5083a97 __x64_sys_exit
++Kernel Stack: ffffffffa5082e11 do_exit
++USER Stack: 7f92232c1e64 [unknown]
++
++We can see that aliexitsnoop prints the user stack and kernel stack of the
++exiting processes. Note that aliexitsnoop can only get the raw address trace
++of the user stack. The reason why it cannot resolve the symbols corresponding
++to these addresses is that the user address space of the process no longer
++exists after the process ends. However, the address trace of the user stack
++still helps, because you can analyze it together with /proc/$pid/maps.
+diff --git a/tools/aliext4writeslower.py b/tools/aliext4writeslower.py
+new file mode 100755
+index 0000000..db626c7
+--- /dev/null
++++ b/tools/aliext4writeslower.py
+@@ -0,0 +1,411 @@
++#!/usr/bin/python
++# @lint-avoid-python-3-compatibility-imports
++#
++# aliext4writeslower   Trace slow ext4 write operation.
++#                      For Linux, uses BCC, eBPF.
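++#
++# Implementation note: overall write(2) latency is measured by a kprobe and
++# kretprobe pair on ext4_file_write_iter(), and each per-stage time is
++# accumulated by kprobe/kretprobe pairs on the corresponding ext4, jbd2 and
++# writeback helper functions, so this tool will need updating to match any
++# changes to those kernel symbols.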
++#
++# USAGE: aliext4writeslower [-h] [-p PID] [-d device] [min_ms]
++#
++# This script traces key steps of write(2) which may be time-consuming.
++# It traces four key steps: jbd2 handle start (it may need a jbd2 transaction
++# checkpoint or commit), page cache allocation, block allocation and
++# dirty page balancing, and it also takes sched delay into account.
++#
++# With this tool, we can clearly see how write(2) latency is apportioned.
++#
++# WARNING: This adds low-overhead instrumentation to these ext4 operations,
++# including reads and writes from the file system cache. Such reads and writes
++# can be very frequent (depending on the workload; eg, 1M/sec), at which
++# point the overhead of this tool (even if it prints no "slower" events) can
++# begin to become significant.
++#
++# By default, a minimum millisecond threshold of 10 is used.
++#
++# Licensed under the Apache License, Version 2.0 (the "License")
++#
++# 2019-08-29   Xiaoguang Wang   Created this.
++
++from __future__ import print_function
++from bcc import BPF
++import argparse
++from time import strftime
++import ctypes as ct
++
++# symbols
++kallsyms = "/proc/kallsyms"
++
++# arguments
++examples = """examples:
++    ./aliext4writeslower           # trace write(2) slower than 10 ms (default)
++    ./aliext4writeslower 1         # trace write(2) slower than 1 ms
++    ./aliext4writeslower 0         # trace all operations (warning: verbose)
++    ./aliext4writeslower -d sda5   # inspect specified device /dev/sda5 containing ext4 fs
++    ./aliext4writeslower -p 185    # trace PID 185 only
++"""
++parser = argparse.ArgumentParser(
++    description="Trace common ext4 write(2) slower than a threshold",
++    formatter_class=argparse.RawDescriptionHelpFormatter,
++    epilog=examples)
++parser.add_argument("-p", "--pid", help="trace this PID only")
++parser.add_argument("-d", "--device", help="inspect specified device")
++parser.add_argument("min_ms", nargs="?", default='10',
++    help="minimum I/O duration to trace, in ms (default 10)")
++parser.add_argument("--ebpf", action="store_true",
++    help=argparse.SUPPRESS)
++args = parser.parse_args()
++min_ms = int(args.min_ms)
++pid = args.pid
++debug = 0
++
++# define BPF program
++bpf_text = """
++#include
++#include
++#include
++
++enum {
++    TRACE_PAGECACHE_ALLOC = 0,
++    TRACE_JOURNAL_START,
++    TRACE_BLOCK_ALLOC,
++    TRACE_BALANCE_DIRTY,
++    TRACE_WRITE_BEGIN,
++    TRACE_WRITE_END,
++    TRACE_TYPE_MAX,
++};
++
++struct val_t {
++    dev_t dev;
++    u64 ts;
++    u64 sched_delay;
++    u64 trace_ts[TRACE_TYPE_MAX];
++    u64 trace_time[TRACE_TYPE_MAX];
++    u64 offset;
++    struct file *fp;
++};
++
++struct data_t {
++    dev_t dev;
++    u64 size;
++    u64 offset;
++    u64 delta_us;
++    u64 pagecache_alloc_us;
++    u64 journal_start_us;
++    u64 block_alloc_us;
++    u64 balance_dirty_us;
++    u64 write_begin_us;
++    u64 write_end_us;
++    u64 sched_delay;
++    u64 pid;
++    char task[TASK_COMM_LEN];
++    char file[DNAME_INLINE_LEN];
++};
++
++BPF_HASH(entryinfo, u64, struct val_t);
++BPF_PERF_OUTPUT(events);
++
++
++// ext4_file_write_iter():
++int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb)
++{
++    struct val_t val = {};
++    u64 id = bpf_get_current_pid_tgid();
++    u32 pid = id >> 32; // PID is higher part
++    struct task_struct *p = (struct task_struct *)bpf_get_current_task();
++    struct inode *inode = iocb->ki_filp->f_inode;
++    dev_t dev = inode->i_sb->s_bdev->bd_dev;
++
++    if (FILTER_DEV)
++        return 0;
++
++    if (FILTER_PID)
++        return 0;
++
++    val.ts = bpf_ktime_get_ns();
++    val.sched_delay = p->sched_info.run_delay;
++    val.fp = iocb->ki_filp;
++    val.dev = dev;
++    val.offset =
iocb->ki_pos; ++ if (val.fp) ++ entryinfo.update(&id, &val); ++ ++ return 0; ++} ++ ++int trace_write_return(struct pt_regs *ctx) ++{ ++ struct val_t *valp; ++ u64 ts, delta_us, id = bpf_get_current_pid_tgid(); ++ u32 size, pid = id >> 32; ++ struct task_struct *p = (struct task_struct *)bpf_get_current_task(); ++ struct data_t data = {}; ++ ++ if (FILTER_PID) ++ return 0; ++ ++ valp = entryinfo.lookup(&id); ++ if (valp == 0) ++ return 0; ++ ++ ts = bpf_ktime_get_ns(); ++ delta_us = (ts - valp->ts) / 1000; ++ entryinfo.delete(&id); ++ if (FILTER_US) ++ return 0; ++ ++ size = PT_REGS_RC(ctx); ++ data.size = size; ++ data.delta_us = delta_us; ++ data.pid = pid; ++ data.sched_delay = (p->sched_info.run_delay - valp->sched_delay) / 1000; ++ data.offset = valp->offset; ++ data.dev = valp->dev; ++ data.pagecache_alloc_us = valp->trace_time[TRACE_PAGECACHE_ALLOC] / 1000; ++ data.journal_start_us = valp->trace_time[TRACE_JOURNAL_START] / 1000; ++ data.block_alloc_us = valp->trace_time[TRACE_BLOCK_ALLOC] / 1000; ++ data.balance_dirty_us = valp->trace_time[TRACE_BALANCE_DIRTY] / 1000; ++ data.write_begin_us = valp->trace_time[TRACE_WRITE_BEGIN] / 1000; ++ data.write_end_us = valp->trace_time[TRACE_WRITE_END] / 1000; ++ bpf_get_current_comm(&data.task, sizeof(data.task)); ++ ++ // workaround (rewriter should handle file to d_name in one step): ++ struct dentry *de = NULL; ++ struct qstr qs = {}; ++ bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry); ++ bpf_probe_read(&qs, sizeof(qs), (void *)&de->d_name); ++ if (qs.len == 0) ++ return 0; ++ bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name); ++ ++ // output ++ events.perf_submit(ctx, &data, sizeof(data)); ++ ++ return 0; ++} ++ ++ ++static int trace_type_entry(struct pt_regs *ctx, int type) ++{ ++ struct val_t *valp; ++ u64 id = bpf_get_current_pid_tgid(); ++ u32 pid = id >> 32; // PID is higher part ++ u64 ts = bpf_ktime_get_ns(); ++ ++ valp = entryinfo.lookup(&id); ++ if (valp == 0) { ++ // missed tracing issue or filtered ++ return 0; ++ } ++ ++ valp->trace_ts[type] = ts; ++ return 0; ++} ++ ++static int trace_type_return(struct pt_regs *ctx, int type) ++{ ++ struct val_t *valp; ++ u64 id = bpf_get_current_pid_tgid(); ++ u32 pid = id >> 32; // PID is higher part ++ u64 ts = bpf_ktime_get_ns(); ++ ++ valp = entryinfo.lookup(&id); ++ if (valp == 0) { ++ // missed tracing issue or filtered ++ return 0; ++ } ++ ++ valp->trace_time[type] += ts - valp->trace_ts[type];; ++ return 0; ++} ++ ++int trace_pagecache_alloc_entry(struct pt_regs *ctx) ++{ ++ return trace_type_entry(ctx, TRACE_PAGECACHE_ALLOC); ++} ++ ++int trace_pagecache_alloc_return(struct pt_regs *ctx) ++{ ++ return trace_type_return(ctx, TRACE_PAGECACHE_ALLOC); ++} ++ ++int trace_journal_start_entry(struct pt_regs *ctx) ++{ ++ return trace_type_entry(ctx, TRACE_JOURNAL_START); ++} ++ ++int trace_journal_start_return(struct pt_regs *ctx) ++{ ++ return trace_type_return(ctx, TRACE_JOURNAL_START); ++} ++ ++int trace_block_alloc_entry(struct pt_regs *ctx) ++{ ++ return trace_type_entry(ctx, TRACE_BLOCK_ALLOC); ++} ++ ++int trace_block_alloc_return(struct pt_regs *ctx) ++{ ++ return trace_type_return(ctx, TRACE_BLOCK_ALLOC); ++} ++ ++int trace_balance_dirty_entry(struct pt_regs *ctx) ++{ ++ return trace_type_entry(ctx, TRACE_BALANCE_DIRTY); ++} ++ ++int trace_balance_dirty_return(struct pt_regs *ctx) ++{ ++ return trace_type_return(ctx, TRACE_BALANCE_DIRTY); ++} ++ ++int trace_aops_write_begin_entry(struct pt_regs *ctx) ++{ ++ return trace_type_entry(ctx, TRACE_WRITE_BEGIN); 
++} ++ ++int trace_aops_write_begin_return(struct pt_regs *ctx) ++{ ++ return trace_type_return(ctx, TRACE_WRITE_BEGIN); ++} ++ ++int trace_aops_write_end_entry(struct pt_regs *ctx) ++{ ++ return trace_type_entry(ctx, TRACE_WRITE_END); ++} ++ ++int trace_aops_write_end_return(struct pt_regs *ctx) ++{ ++ return trace_type_return(ctx, TRACE_WRITE_END); ++} ++""" ++ ++devid2name={} ++num_trans=0 ++ ++def init_dev_name(): ++ global devid2name ++ ++ f = open("/proc/partitions") ++ for line in f.readlines(): ++ line = line.strip() ++ if not len(line): ++ continue; ++ ++ str = line.split() ++ if str[0].isalpha(): ++ continue ++ dev_num = (int(str[0]) << 20) + int(str[1]) ++ devid2name[dev_num] = str[3] ++ f.close() ++ ++init_dev_name() ++ ++name2devid={v:k for k,v in devid2name.items()} ++ ++if args.device: ++ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) ++else: ++ bpf_text = bpf_text.replace('FILTER_DEV', '0') ++ ++if min_ms == 0: ++ bpf_text = bpf_text.replace('FILTER_US', '0') ++else: ++ bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_ms * 1000)) ++ ++if args.pid: ++ bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) ++else: ++ bpf_text = bpf_text.replace('FILTER_PID', '0') ++ ++if debug or args.ebpf: ++ print(bpf_text) ++ if args.ebpf: ++ exit() ++ ++# kernel->user event data: struct data_t ++DNAME_INLINE_LEN = 32 # linux/dcache.h ++TASK_COMM_LEN = 16 # linux/sched.h ++class Data(ct.Structure): ++ _fields_ = [ ++ ("dev", ct.c_uint), ++ ("size", ct.c_ulonglong), ++ ("offset", ct.c_ulonglong), ++ ("delta_us", ct.c_ulonglong), ++ ("pagecache_alloc_us", ct.c_ulonglong), ++ ("journal_start_us", ct.c_ulonglong), ++ ("block_alloc_us", ct.c_ulonglong), ++ ("balance_dirty_us", ct.c_ulonglong), ++ ("write_begin_us", ct.c_ulonglong), ++ ("write_end_us", ct.c_ulonglong), ++ ("sched_delay", ct.c_ulonglong), ++ ("pid", ct.c_ulonglong), ++ ("task", ct.c_char * TASK_COMM_LEN), ++ ("file", ct.c_char * DNAME_INLINE_LEN) ++ ] ++ ++# process event ++def print_event(cpu, data, size): ++ event = ct.cast(data, ct.POINTER(Data)).contents ++ ++ print("%-8s %-8s %-14.14s %-6s %-7s %-8d %7.2f %8.2f %10.2f %11.2f %13.2f %11.2f %9.2f %11.2f %s" % ++ (strftime("%H:%M:%S"), devid2name[event.dev], ++ event.task.decode(), event.pid, event.size, event.offset / 1024, ++ float(event.delta_us) / 1000, float(event.pagecache_alloc_us) / 1000, ++ float(event.journal_start_us) / 1000, float(event.block_alloc_us) / 1000, ++ float(event.balance_dirty_us) / 1000, float(event.write_begin_us) / 1000, ++ float(event.write_end_us) / 1000, float(event.sched_delay) / 1000, ++ event.file.decode())) ++ ++# initialize BPF ++b = BPF(text=bpf_text) ++ ++b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_write_entry") ++b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") ++ ++# trace address_space_operations .write_begin() ++b.attach_kprobe(event="ext4_da_write_begin", fn_name="trace_aops_write_begin_entry") ++b.attach_kprobe(event="ext4_write_begin", fn_name="trace_aops_write_begin_entry") ++b.attach_kretprobe(event="ext4_da_write_begin", fn_name="trace_aops_write_begin_return") ++b.attach_kretprobe(event="ext4_write_begin", fn_name="trace_aops_write_begin_return") ++ ++# trace address_space_operations .write_end() ++b.attach_kprobe(event="ext4_da_write_end", fn_name="trace_aops_write_end_entry") ++b.attach_kprobe(event="ext4_write_end", fn_name="trace_aops_write_end_entry") ++b.attach_kretprobe(event="ext4_da_write_end", 
fn_name="trace_aops_write_end_return")
++b.attach_kretprobe(event="ext4_write_end", fn_name="trace_aops_write_end_return")
++
++# trace page cache allocation
++b.attach_kprobe(event="grab_cache_page_write_begin", fn_name="trace_pagecache_alloc_entry")
++b.attach_kretprobe(event="grab_cache_page_write_begin", fn_name="trace_pagecache_alloc_return")
++
++# trace journal start
++b.attach_kprobe(event="jbd2__journal_start", fn_name="trace_journal_start_entry")
++b.attach_kretprobe(event="jbd2__journal_start", fn_name="trace_journal_start_return")
++
++# trace block allocation
++b.attach_kprobe(event="ext4_da_get_block_prep", fn_name="trace_block_alloc_entry")
++b.attach_kretprobe(event="ext4_da_get_block_prep", fn_name="trace_block_alloc_return")
++b.attach_kprobe(event="_ext4_get_block", fn_name="trace_block_alloc_entry")
++b.attach_kretprobe(event="_ext4_get_block", fn_name="trace_block_alloc_return")
++
++# trace balance dirty pages
++b.attach_kprobe(event="balance_dirty_pages_ratelimited", fn_name="trace_balance_dirty_entry")
++b.attach_kretprobe(event="balance_dirty_pages_ratelimited", fn_name="trace_balance_dirty_return")
++
++# header
++if min_ms == 0:
++    print("Tracing ext4 operations")
++else:
++    print("Tracing ext4 operations slower than %d ms" % min_ms)
++
++print("%-8s %-8s %-14s %-6s %-7s %-8s %7s %8s %10s %11s %13s %11s %9s %11s %s" % ("TIME",
++    "DEVICE", "COMM", "PID", "BYTES", "OFF_KB", "LAT(ms)", "pg_alloc", "jbd2_start",
++    "block_alloc", "balance_dirty", "write_begin", "write_end", "sched_delay", "FILENAME"))
++
++# read events
++b["events"].open_perf_buffer(print_event, page_cnt=64)
++while 1:
++    try:
++        b.perf_buffer_poll()
++
++    except KeyboardInterrupt:
++        exit()
+diff --git a/tools/aliext4writeslower_example.txt b/tools/aliext4writeslower_example.txt
+new file mode 100644
+index 0000000..b702d9d
+--- /dev/null
++++ b/tools/aliext4writeslower_example.txt
+@@ -0,0 +1,68 @@
++Demonstrations of aliext4writeslower.
++
++This script traces key steps of write(2) which may be time-consuming.
++It traces four key steps: jbd2 handle start (it may need a jbd2 transaction
++checkpoint or commit), page cache allocation, block allocation and
++dirty page balancing, and it also takes sched delay into account.
++With this tool, we can clearly see how write(2) latency is apportioned.
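++
++Note that the per-stage columns are measured by separate kprobe/kretprobe
++pairs, so they nest and overlap rather than sum independently to LAT(ms).
++For instance, page cache allocation, jbd2 handle start and block allocation
++typically run inside .write_begin(), so in the first row of Example 1 below
++write_begin's 2.50 ms already contains roughly pg_alloc + jbd2_start +
++block_alloc = 1.02 + 0.68 + 0.25 = 1.95 ms.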
++ ++Usage: ++usage: aliext4writeslower [-h] [-p PID] [-d DEVICE] [min_ms] ++ ++Trace common ext4 write(2) slower than a threshold ++ ++positional arguments: ++ min_ms minimum I/O duration to trace, in ms (default 10) ++ ++optional arguments: ++ -h, --help show this help message and exit ++ -p PID, --pid PID trace this PID only ++ -d DEVICE, --device DEVICE ++ inspect specified device ++ ++examples: ++ ./aliext4writeslower # trace write(2) slower than 10 ms (default) ++ ./aliext4writeslower 1 # trace write(2) slower than 1 ms ++ ./aliext4writeslower 0 # trace all operations (warning: verbose) ++ ./aliext4writeslower -d sda5 # inspect specified device /dev/sda5 containing ext4 fs ++ ./aliext4writeslower -p 185 # trace PID 185 only ++ ++ ++Output example: ++ ++Example 1: ++# Trace write(2) slower than 5 ms ++python aliext4writeslower.py 5 ++ ++Tracing ext4 operations slower than 5 ms ++TIME DEVICE COMM PID BYTES OFF_KB LAT(ms) pg_alloc jbd2_start block_alloc balance_dirty write_begin write_end sched_delay FILENAME ++17:14:07 sda5 dd 31149 1048576 473088 5.15 1.02 0.68 0.25 0.54 2.50 1.57 0.00 testfile ++17:14:07 sda5 dd 31149 1048576 474112 5.27 1.07 0.68 0.27 0.54 2.58 1.59 0.00 testfile ++17:14:07 sda5 dd 31149 1048576 475136 5.09 1.01 0.69 0.30 0.53 2.52 1.49 0.00 testfile ++17:14:07 sda5 dd 31149 1048576 607232 5.12 1.09 0.67 0.21 0.54 2.52 1.50 0.00 testfile ++17:14:07 sda5 dd 31149 1048576 637952 6.88 1.20 1.15 0.34 0.75 3.38 1.97 0.00 testfile ++17:14:07 sda5 dd 31149 1048576 674816 5.29 1.17 0.68 0.20 0.55 2.60 1.58 0.00 testfile ++17:14:08 sda5 dd 31149 1048576 738304 5.08 0.99 0.69 0.27 0.54 2.46 1.54 0.00 testfile ++17:14:08 sda5 dd 31149 1048576 790528 5.03 0.94 0.69 0.25 0.54 2.42 1.50 0.00 testfile ++17:14:08 sda5 dd 31149 1048576 803840 5.04 0.96 0.67 0.29 0.54 2.45 1.51 0.00 testfile ++17:14:08 sda5 dd 31149 1048576 870400 5.15 1.10 0.68 0.19 0.55 2.52 1.53 0.00 testfile ++17:14:09 sda5 dd 31149 1048576 986112 5.08 1.00 0.69 0.27 0.54 2.48 1.51 0.00 testfile ++ ++ ++Example 2: ++# Inspect specified device /dev/sda5 containing ext4 fs ++python aliext4writeslower.py -d sda5 ++ ++Tracing ext4 operations ++TIME DEVICE COMM PID BYTES OFF_KB LAT(ms) pg_alloc jbd2_start block_alloc balance_dirty write_begin write_end sched_delay FILENAME ++17:16:24 sda3 su 32894 292 0 0.03 0.00 0.00 0.00 0.00 0.01 0.00 0.00 lastlog ++17:16:24 sda3 syslog-ng 990 67 456973 0.07 0.01 0.01 0.00 0.00 0.02 0.01 0.00 secure-20190805 ++17:16:24 sda3 syslog-ng 990 106 456973 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 secure-20190805 ++17:16:24 sda3 syslog-ng 990 67 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 ++17:16:24 sda3 syslog-ng 990 106 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 ++17:16:24 sda3 ilogtail 56674 199 3296 0.04 0.00 0.01 0.00 0.00 0.01 0.01 0.00 000147.log ++17:16:24 sda3 syslog-ng 990 95 456974 0.04 0.01 0.01 0.00 0.00 0.01 0.01 0.00 secure-20190805 ++17:16:24 sda3 syslog-ng 990 95 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 ++17:16:24 sda3 xperf 2558 874 403 0.03 0.00 0.01 0.00 0.00 0.01 0.01 0.00 xperf.pmu.data.0 ++17:16:25 sda3 xperf 2558 819 403 0.06 0.01 0.01 0.00 0.01 0.02 0.01 0.00 xperf.pmu.data.0 ++17:16:25 sda3 syslog-ng 990 175 456974 0.05 0.01 0.01 0.00 0.00 0.01 0.01 0.00 secure-20190805 +diff --git a/tools/alihardirqs.py b/tools/alihardirqs.py +new file mode 100755 +index 0000000..b8b0500 +--- /dev/null ++++ b/tools/alihardirqs.py +@@ -0,0 +1,219 @@ ++#!/usr/bin/python ++# @lint-avoid-python-3-compatibility-imports 
++# ++# alihardirqs Summarize hard IRQ (interrupt) event time. ++# Enhanced version of BCC hardirqs. ++# For Linux, uses BCC, eBPF. ++# ++# USAGE: alihardirqs [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] [interval] [outputs] ++# ++# Copyright (c) 2015 Brendan Gregg. ++# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. ++# Licensed under the Apache License, Version 2.0 (the "License") ++ ++from __future__ import print_function ++from bcc import BPF ++from time import sleep, strftime ++import argparse ++ ++# arguments ++examples = """examples: ++ ./alihardirqs # sum hard irq event time ++ ./alihardirqs -D # show hard irq event time as histograms ++ ./alihardirqs -c 0 # show hard irq event time of CPU 0 only ++ ./alihardirqs -i 25 # show event time of IRQ 25 only ++ ./alihardirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only ++ ./alihardirqs 1 10 # print 1 second summaries, 10 times ++ ./alihardirqs -NT 1 # 1s summaries, nanoseconds, and timestamps ++""" ++parser = argparse.ArgumentParser( ++ description="Summarize hard irq statistics, including total count, time, etc.", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=examples) ++parser.add_argument("-T", "--timestamp", action="store_true", ++ help="include timestamp on output") ++parser.add_argument("-N", "--nanoseconds", action="store_true", ++ help="output in nanoseconds") ++parser.add_argument("-D", "--dist", action="store_true", ++ help="show distributions as histograms") ++parser.add_argument("-c", "--cpu", ++ help="output hardIRQ statistics on specific CPU only") ++parser.add_argument("-i", "--irq", ++ help="output statistics of specific hardIRQ") ++parser.add_argument("interval", nargs="?", default=99999999, ++ help="output interval, in seconds") ++parser.add_argument("outputs", nargs="?", default=99999999, ++ help="number of outputs") ++parser.add_argument("--ebpf", action="store_true", ++ help=argparse.SUPPRESS) ++ ++args = parser.parse_args() ++countdown = int(args.outputs) ++distmode = "1" if args.dist else "0" ++if args.nanoseconds: ++ factor = 1 ++ label = "nsecs" ++else: ++ factor = 1000 ++ label = "usecs" ++debug = 0 ++ ++# define BPF program ++bpf_text = """ ++#include ++ ++typedef struct irq_key { ++ u32 cpu; ++ u32 irq; ++ char name[32]; ++} irq_key_t; ++ ++typedef struct irq_key_slot { ++ irq_key_t key; ++ u64 slot; ++} irq_key_slot_t; ++ ++typedef struct irq_val { ++ u64 count; ++ u64 time; ++ u64 time_max; ++} irq_val_t; ++ ++typedef struct stamp { ++ u64 ts; ++ char name[32]; ++} stamp_t; ++ ++BPF_PERCPU_ARRAY(start, stamp_t, 1); ++BPF_HASH(res, irq_key_t, irq_val_t); ++BPF_HISTOGRAM(dist, irq_key_slot_t); ++ ++ ++TRACEPOINT_PROBE(irq, irq_handler_entry) ++{ ++ COND_FILTER ++ ++ u32 idx = 0; ++ stamp_t stamp = { ++ .ts = bpf_ktime_get_ns(), ++ }; ++ TP_DATA_LOC_READ_CONST(stamp.name, name, sizeof(stamp.name)); ++ start.update(&idx, &stamp); ++ ++ return 0; ++} ++ ++TRACEPOINT_PROBE(irq, irq_handler_exit) ++{ ++ u64 delta; ++ u32 cpu = bpf_get_smp_processor_id(); ++ ++ u32 idx = 0; ++ // fetch timestamp and calculate delta ++ stamp_t *stampp = start.lookup(&idx); ++ if (!stampp || stampp->ts == 0) { ++ return 0; // missed start ++ } ++ ++ delta = bpf_ktime_get_ns() - stampp->ts; ++ irq_key_t key = {.cpu = cpu, .irq = args->irq}; ++ char *name = (char *)stampp->name; ++ bpf_probe_read(key.name, sizeof(key.name), name); ++ ++#if DIST_MODE ++ irq_key_slot_t key_slot = { ++ .key = key, ++ .slot = bpf_log2l(delta / FACTOR), ++ }; ++ dist.increment(key_slot); ++#else ++ struct irq_val *valp, val; ++ valp = 
res.lookup(&key);
++
++    if (valp) {
++        valp->count += 1;
++        valp->time += delta;
++        if (valp->time_max < delta) {valp->time_max = delta;}
++    }
++    else {
++        val.count = 1;
++        val.time = val.time_max = delta;
++        res.update(&key, &val);
++    }
++#endif
++    stamp_t zero = {};
++    start.update(&idx, &zero);
++
++    return 0;
++}
++"""
++
++cond_filter = ''
++cpu_filter = "1"
++if args.cpu:
++    cpu_filter = "bpf_get_smp_processor_id() == %s" % args.cpu
++
++irq_filter = "1"
++if args.irq:
++    irq_filter = "args->irq == %s" % args.irq
++
++cond_filter = "if (!(%s && %s)) {return 0;}" % (cpu_filter, irq_filter)
++bpf_text = bpf_text.replace('COND_FILTER', cond_filter)
++
++bpf_text = bpf_text.replace('FACTOR', "%d" % factor)
++bpf_text = bpf_text.replace('DIST_MODE', distmode)
++
++
++# output eBPF program C code after it is replaced, used by debugging
++if debug or args.ebpf:
++    print(bpf_text)
++    if args.ebpf:
++        exit()
++
++
++def print_section(key):
++    return "CPU %d, hardIRQ %d (%s)" % (key[0], key[1], key[2].decode('utf-8', 'replace'))
++
++
++# load BPF program
++b = BPF(text=bpf_text)
++print("Tracing hard irq event time... Hit Ctrl-C to end.")
++
++if args.dist:
++    tab = b.get_table("dist")
++else:
++    tab = b.get_table("res")
++
++
++# output
++while (1):
++    try:
++        sleep(int(args.interval))
++    except KeyboardInterrupt:
++        countdown = 1
++    print()
++    if args.timestamp:
++        print("%-8s\n" % strftime("%H:%M:%S"), end="")
++
++    if args.dist:
++        tab.print_log2_hist("time_" + label, "section",
++            section_print_fn=print_section,
++            bucket_fn=lambda k: (k.cpu, k.irq, k.name),
++            bucket_sort_fn=lambda keys: sorted(keys))
++    else:
++        cur_cpu = -1
++        for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].irq)):
++            if cur_cpu != k.cpu:
++                print("\nCPU%d:" % k.cpu)
++                print("%10s %20s %10s %10s %10s" %
++                    ("HARDIRQ", "IRQHandler", "Count", "TOTAL_" + label, "MAX_" + label))
++                cur_cpu = k.cpu
++
++            print("%10d %20s %10d %10d %10d" %
++                (k.irq, k.name.decode('utf-8', 'replace'), v.count, v.time / factor, v.time_max / factor))
++
++    tab.clear()
++
++    countdown -= 1
++    if countdown == 0:
++        exit()
+diff --git a/tools/alihardirqs_example.txt b/tools/alihardirqs_example.txt
+new file mode 100644
+index 0000000..caebb47
+--- /dev/null
++++ b/tools/alihardirqs_example.txt
+@@ -0,0 +1,93 @@
++Demonstrations of alihardirqs, the enhanced version of BCC hardirqs.
++
++This program traces hard interrupts (irqs), and stores timing statistics
++in-kernel for efficiency.
++
++
++Example 1
++
++By default, statistics of all hardirqs on all CPUs are sampled,
++and the output is displayed in a per-hardirq and per-CPU flavor.
++
++# alihardirqs
++Tracing hard irq event time... Hit Ctrl-C to end.
++^C
++
++CPU3:
++   HARDIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
++        31      virtio0-input.0          2           4          2
++
++
++Example 2
++
++You can sample statistics on a specified CPU only with the "-c" option,
++and the output will be displayed in a per-hardirq flavor.
++
++# alihardirqs -c 3
++Tracing hard irq event time... Hit Ctrl-C to end.
++^C
++
++CPU3:
++   HARDIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
++        31      virtio0-input.0          3          10          7
++
++
++Example 3
++
++You can sample statistics of a specified hardirq only with the "-i" option,
++and the output will be displayed in a per-CPU flavor.
++
++# alihardirqs -i 31
++Tracing hard irq event time... Hit Ctrl-C to end.
++^C ++ ++CPU3: ++ HARDIRQ IRQHandler Count TOTAL_usecs MAX_usecs ++ 31 virtio0-input.0 3 8 6 ++ ++ ++Example 4 ++ ++The distribution of hardirq run time can be printed as a histogram with "-D" ++option. The output will be displayed in a per-hardirq and per-CPU flavor. ++ ++# alihardirqs -D ++Tracing hard irq event time... Hit Ctrl-C to end. ++^C ++ ++section = CPU 3, hardIRQ 31 (virtio0-input.0) ++ time_usecs : count distribution ++ 0 -> 1 : 1 |****************************************| ++ 2 -> 3 : 0 | | ++ 4 -> 7 : 1 |****************************************| ++ ++ ++ ++USAGE message: ++ ++# alihardirqs -h ++usage: alihardirqs.py [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] ++ [interval] [outputs] ++ ++Summarize hard irq statistics, including total count, time, etc. ++ ++positional arguments: ++ interval output interval, in seconds ++ outputs number of outputs ++ ++optional arguments: ++ -h, --help show this help message and exit ++ -T, --timestamp include timestamp on output ++ -N, --nanoseconds output in nanoseconds ++ -D, --dist show distributions as histograms ++ -c CPU, --cpu CPU output hardIRQ statistics on specific CPU only ++ -i IRQ, --irq IRQ output statistics of specific hardIRQ ++ ++examples: ++ ./alihardirqs # sum hard irq event time ++ ./alihardirqs -D # show hard irq event time as histograms ++ ./alihardirqs -c 0 # show hard irq event time of CPU 0 only ++ ./alihardirqs -i 25 # show event time of IRQ 25 only ++ ./alihardirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only ++ ./alihardirqs 1 10 # print 1 second summaries, 10 times ++ ./alihardirqs -NT 1 # 1s summaries, nanoseconds, and timestamps +diff --git a/tools/alijbd2stat.py b/tools/alijbd2stat.py +new file mode 100755 +index 0000000..f727fc1 +--- /dev/null ++++ b/tools/alijbd2stat.py +@@ -0,0 +1,392 @@ ++#!/usr/bin/python ++# @lint-avoid-python-3-compatibility-imports ++# ++# alijbd2stat Summarize filesystem jbd2 stat. ++# For Linux, uses BCC, eBPF. ++# ++# USAGE: alijbd2stat [-h] [-t thresh] [-d device] ++# ++# Copyright (c) 2019-2021 Alibaba Group. ++# Licensed under the Apache License, Version 2.0 (the "License") ++# ++# 2019/08/05 Xiaoguang Wang Created this. 
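++#
++# Implementation note: this tool attaches kprobes to start_this_handle()
++# and jbd2_journal_stop(), and uses the jbd2:jbd2_handle_start and
++# jbd2:jbd2_run_stats tracepoints, so it will need updating to match any
++# changes to these kernel symbols and tracepoints.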
++
++from __future__ import print_function
++from bcc import BPF
++import ctypes as ct
++import argparse
++import errno
++import time
++
++# arguments
++examples = """examples:
++    ./alijbd2stat           # summarize filesystem jbd2 stat
++    ./alijbd2stat -d sda3   # inspect specified device /dev/sda3
++    ./alijbd2stat -t 10     # show jbd2 handle's context when it runs more than 10ms
++"""
++parser = argparse.ArgumentParser(
++    description="Summarize filesystem jbd2 stat",
++    formatter_class=argparse.RawDescriptionHelpFormatter,
++    epilog=examples)
++parser.add_argument("-d", "--device", help="inspect specified device")
++parser.add_argument("-t", "--thresh_time", nargs="?", default=0,
++    help="show jbd2 handle's context when its run time is greater than this value")
++parser.add_argument("--ebpf", action="store_true",
++    help=argparse.SUPPRESS)
++args = parser.parse_args()
++debug = 0
++
++# define BPF program
++bpf_text = """
++#include
++#include
++#include
++#include
++
++struct handle_stat {
++    dev_t dev;
++    u32 type;
++    u32 line_no;
++    unsigned long tid;
++    unsigned long pre_start_time;
++    unsigned long start_time;
++    unsigned long end_time;
++    u64 sched_delay;
++    u64 kernel_ip;
++    int kernel_stack_id;
++    u32 pid;
++    char comm[TASK_COMM_LEN];
++};
++
++struct transaction_run_stat {
++    dev_t dev;
++    u32 handle_count;
++    u32 blocks;
++    u32 blocks_logged;
++    unsigned long tid;
++    unsigned long wait;
++    unsigned long request_delay;
++    unsigned long running;
++    unsigned long locked;
++    unsigned long flushing;
++    unsigned long logging;
++};
++
++/*
++struct handle_info_per_type {
++    u32 count
++};
++*/
++
++BPF_HASH(handle_stat_map, struct task_struct *, struct handle_stat);
++BPF_STACK_TRACE(stack_traces, 1024);
++BPF_PERF_OUTPUT(jbd2_handle_stat);
++BPF_PERF_OUTPUT(transaction_stat);
++
++
++int trace_start_this_handle(struct pt_regs *ctx, journal_t *journal,
++        handle_t *handle, gfp_t gfp_mask)
++{
++    struct handle_stat s;
++    struct task_struct *t = (struct task_struct *)bpf_get_current_task();
++    dev_t dev = journal->j_fs_dev->bd_dev;
++
++    if (FILTER_DEV)
++        return 0;
++
++    memset(&s, 0, sizeof(struct handle_stat));
++    s.dev = dev;
++    s.pre_start_time = bpf_ktime_get_ns();
++    s.sched_delay = t->sched_info.run_delay;
++
++    handle_stat_map.update(&t, &s);
++    return 0;
++}
++
++TRACEPOINT_PROBE(jbd2, jbd2_handle_start)
++{
++    struct handle_stat *s;
++    struct task_struct *t = (struct task_struct *)bpf_get_current_task();
++    dev_t dev = args->dev;
++
++    if (FILTER_DEV)
++        return 0;
++
++    s = handle_stat_map.lookup(&t);
++    if (s == NULL)
++        return 0;
++
++    s->start_time = bpf_ktime_get_ns();
++    s->type = args->type;
++    s->line_no = args->line_no;
++    s->tid = args->tid;
++    return 0;
++}
++
++int trace_jbd2_journal_stop(struct pt_regs *ctx, handle_t *handle)
++{
++    transaction_t *transaction = handle->h_transaction;
++    journal_t *journal;
++    struct handle_stat *s;
++    struct handle_stat s2 = {};
++    struct task_struct *t = (struct task_struct *)bpf_get_current_task();
++    unsigned long end_time = bpf_ktime_get_ns();
++    unsigned long run_time;
++
++    if (transaction == NULL) {
++        handle_stat_map.delete(&t);
++        return 0;
++    }
++
++    journal = transaction->t_journal;
++    dev_t dev = journal->j_fs_dev->bd_dev;
++    if (FILTER_DEV)
++        return 0;
++
++    if (handle->h_ref >= 2)
++        return 0;
++
++    s = handle_stat_map.lookup(&t);
++    if (s == NULL)
++        return 0;
++
++    /*
++     * This could happen: jbd2_journal_start_reserved() misses a
++     * jbd2_handle_start() trace.
++ */ ++ if (!s->start_time) ++ s->start_time = s->pre_start_time; ++ ++ ++ run_time = end_time - s->pre_start_time; ++ if (run_time < FILTER_THRESH_TIME * 1000 * 1000LLU) { ++ handle_stat_map.delete(&t); ++ return 0; ++ } ++ ++ s2.dev = s->dev; ++ s2.type = s->type; ++ s2.line_no = s->line_no; ++ s2.tid = s->tid; ++ s2.pre_start_time = s->pre_start_time; ++ s2.start_time = s->start_time; ++ s2.end_time = end_time; ++ s2.pid = bpf_get_current_pid_tgid(); ++ s2.sched_delay = t->sched_info.run_delay - s->sched_delay; ++ bpf_get_current_comm(&s2.comm, sizeof(s2.comm)); ++ ++ s2.kernel_stack_id = stack_traces.get_stackid(ctx, 0); ++ if (s2.kernel_stack_id >= 0) { ++ // populate extras to fix the kernel stack ++ u64 ip = PT_REGS_IP(ctx); ++ u64 page_offset; ++ ++ // if ip isn't sane, leave key ips as zero for later checking ++ #if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE) ++ // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it ++ page_offset = __PAGE_OFFSET_BASE; ++ #elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4) ++ // x64, 4.17, and later ++ #if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL) ++ page_offset = __PAGE_OFFSET_BASE_L5; ++ #else ++ page_offset = __PAGE_OFFSET_BASE_L4; ++ #endif ++ #else ++ // earlier x86_64 kernels, e.g., 4.6, comes here ++ // arm64, s390, powerpc, x86_32 ++ page_offset = PAGE_OFFSET; ++ #endif ++ ++ if (ip > page_offset) ++ s2.kernel_ip = ip; ++ } ++ ++ jbd2_handle_stat.perf_submit(ctx, &s2, sizeof(struct handle_stat)); ++ handle_stat_map.delete(&t); ++ return 0; ++} ++ ++TRACEPOINT_PROBE(jbd2, jbd2_run_stats) ++{ ++ struct transaction_run_stat s; ++ dev_t dev = args->dev;; ++ ++ if (FILTER_DEV) ++ return 0; ++ ++ memset(&s, 0, sizeof(struct transaction_run_stat)); ++ s.dev = args->dev; ++ s.tid = args->tid; ++ s.wait = args->wait; ++ s.request_delay = args->request_delay; ++ s.running = args->running; ++ s.locked = args->locked; ++ s.flushing = args->flushing; ++ s.logging = args->logging; ++ s.handle_count = args->handle_count; ++ s.blocks = args->blocks; ++ s.blocks_logged = args->blocks_logged; ++ ++ transaction_stat.perf_submit(args, &s, sizeof(struct transaction_run_stat)); ++ return 0; ++} ++ ++""" ++devid2name={} ++num_trans=0 ++ ++def init_dev_name(): ++ global devid2name ++ ++ f = open("/proc/partitions") ++ for line in f.readlines(): ++ line = line.strip() ++ if not len(line): ++ continue; ++ ++ str = line.split() ++ if str[0].isalpha(): ++ continue ++ dev_num = (int(str[0]) << 20) + int(str[1]) ++ devid2name[dev_num] = str[3] ++ f.close() ++ ++init_dev_name() ++ ++name2devid={v:k for k,v in devid2name.items()} ++ ++if args.device: ++ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) ++else: ++ bpf_text = bpf_text.replace('FILTER_DEV', '0') ++ ++if args.thresh_time: ++ bpf_text = bpf_text.replace('FILTER_THRESH_TIME', '%s' % args.thresh_time) ++else: ++ bpf_text = bpf_text.replace('FILTER_THRESH_TIME', '9999') ++ ++# code substitutions ++if debug or args.ebpf: ++ print(bpf_text) ++ if args.ebpf: ++ exit() ++ ++# load BPF program ++b = BPF(text=bpf_text) ++ ++b.attach_kprobe(event="start_this_handle", fn_name="trace_start_this_handle") ++b.attach_kprobe(event="jbd2_journal_stop", fn_name="trace_jbd2_journal_stop") ++ ++print("Tracing jbd2 stats... 
Hit Ctrl-C to end.")
++
++TASK_COMM_LEN = 16
++
++class HandleStat(ct.Structure):
++    _fields_ = [
++        ("dev", ct.c_uint),
++        ("type", ct.c_uint),
++        ("line_no", ct.c_uint),
++        ("tid", ct.c_ulonglong),
++        ("pre_start_time", ct.c_ulonglong),
++        ("start_time", ct.c_ulonglong),
++        ("end_time", ct.c_ulonglong),
++        ("sched_delay", ct.c_ulonglong),
++        ("kernel_ip", ct.c_ulonglong),
++        ("kernel_stack_id", ct.c_int),
++        ("pid", ct.c_uint),
++        ("comm", ct.c_char * TASK_COMM_LEN),
++    ]
++
++class Data(ct.Structure):
++    _fields_ = [
++        ("dev", ct.c_uint),
++        ("handle_count", ct.c_uint),
++        ("blocks", ct.c_uint),
++        ("blocks_logged", ct.c_uint),
++        ("tid", ct.c_ulonglong),
++        ("wait", ct.c_ulonglong),
++        ("request_delay", ct.c_ulonglong),
++        ("running", ct.c_ulonglong),
++        ("locked", ct.c_ulonglong),
++        ("flushing", ct.c_ulonglong),
++        ("logging", ct.c_ulonglong),
++    ]
++
++class trans_stat:
++    def __init__(self, dev):
++        self.dev=dev
++        self.handle_count=0
++        self.blocks=0
++        self.blocks_logged=0
++        self.tid=0
++        self.wait=0
++        self.request_delay=0
++        self.running=0
++        self.locked=0
++        self.flushing=0
++        self.logging=0
++
++
++def stack_id_err(stack_id):
++    # -EFAULT in get_stackid normally means the stack-trace is not available,
++    # such as getting kernel stack trace in userspace code
++    return (stack_id < 0) and (stack_id != -errno.EFAULT)
++
++def aksym(addr):
++    return b.ksym(addr) + "_[k]".encode()
++
++def print_stack(cpu, data, size):
++    k = ct.cast(data, ct.POINTER(HandleStat)).contents
++
++    kernel_tmp = [] if k.kernel_stack_id < 0 else stack_traces.walk(k.kernel_stack_id)
++
++    # fix kernel stack
++    kernel_stack = []
++    if k.kernel_stack_id >= 0:
++        for addr in kernel_tmp:
++            kernel_stack.append(addr)
++        # prepend the IP captured at probe time, if it was sane
++        if k.kernel_ip:
++            kernel_stack.insert(0, k.kernel_ip)
++
++    # print default multi-line stack output
++    if stack_id_err(k.kernel_stack_id):
++        print("    [Missed Kernel Stack]")
++    else:
++        print("comm: %s pid: %u tid: %lu pre_start_time: %lu run_time: %lu sched_delay: %lu" %
++            (k.comm, k.pid, k.tid, (k.start_time - k.pre_start_time) / 1000,
++            (k.end_time - k.pre_start_time) / 1000, k.sched_delay / 1000 / 1000))
++        for addr in kernel_stack:
++            print("    %s" % aksym(addr))
++        #print("    %-16s %s (%d)" % ("-", k.comm.decode('utf-8', 'replace'), k.pid))
++        #print()
++
++def print_header():
++    print("\n%-8s %-12s %-4s %-13s %-7s %-6s %-8s %-7s %-12s %-6s %-13s" % ("dev",
++        "tid", "wait", "request_delay", "running", "locked", "flushing", "logging",
++        "handle_count", "blocks", "blocks_logged"))
++
++# process event
++def print_event(cpu, data, size):
++    global num_trans
++
++    event = ct.cast(data, ct.POINTER(Data)).contents
++    if (num_trans % 10) == 0:
++        print_header()
++
++    print("%-8s %-12lu %-4lu %-13lu %-7lu %-6lu %-8lu %-7lu %-12lu %-6lu %-13lu" %
++        (devid2name[event.dev], event.tid, event.wait, event.request_delay, event.running,
++        event.locked, event.flushing, event.logging, event.handle_count, event.blocks,
++        event.blocks_logged))
++    num_trans = num_trans + 1
++
++label="usecs"
++stack_traces = b["stack_traces"]
++b["jbd2_handle_stat"].open_perf_buffer(print_stack, page_cnt=64)
++b["transaction_stat"].open_perf_buffer(print_event, page_cnt=64)
++while 1:
++    try:
++        b.perf_buffer_poll()
++
++    except KeyboardInterrupt:
++        exit()
+diff --git a/tools/alijbd2stat_example.txt b/tools/alijbd2stat_example.txt
+new file mode 100644
+index 0000000..a65f12e
+--- /dev/null
++++ b/tools/alijbd2stat_example.txt
+@@ -0,0 +1,83 @@
++Demonstrations of alijbd2stat.
++Alijbd2stat can trace every jbd2 handle and every transaction commit ++info. If one jbd2 handle's running time is greater than user specified ++threshold time, alijbd2stat will catch this handle's context. ++ ++Usage: ++alijbd2stat [-h] [-d DEVICE] [-t [THRESH_TIME]] ++ ++Summarize filesystem jbd2 stat ++ ++optional arguments: ++ -h, --help show this help message and exit ++ -d DEVICE, --device DEVICE ++ inspect specified device ++ -t [THRESH_TIME], --thresh_time [THRESH_TIME] ++ show jbd2 handle's context when its run time is ++ greater than this value ++ ++examples: ++ ./alijbd2stat # show jbd2 transaction commit info. ++ ./alijbd2stat -d sda3 # inspect specified device /dev/sda3. ++ ./alijbd2stat -t 10 # show jbd2 handle's context when it runs more than 10ms. ++ ++ ++Output example: ++ ++Example 1: ++# Show jbd2 transaction commit info. ++python alijbd2stat.py ++ ++dev tid wait request_delay running locked flushing logging handle_count blocks blocks_logged ++sda5 11122351 0 0 5052 0 0 0 61 34 13 ++sda3 13641140 0 0 5280 0 0 0 138 19 20 ++sda5 11122352 0 0 5041 0 0 0 59 35 14 ++sda3 13641141 0 0 5000 0 0 0 113 9 10 ++sda5 11122353 0 0 5069 0 0 0 91 47 18 ++ ++Example 2: ++# inspect specified device /dev/sda5 ++python alijbd2stat.py -d sda5 ++ ++dev tid wait request_delay running locked flushing logging handle_count blocks blocks_logged ++sda5 11122461 0 0 5029 0 0 0 75 38 15 ++sda5 11122462 0 0 5107 0 0 0 74 42 17 ++sda5 11122463 0 0 5028 0 0 1 88 50 17 ++sda5 11122464 0 0 5128 0 0 0 71 40 19 ++sda5 11122465 0 0 5243 0 0 0 83 44 13 ++sda5 11122466 0 0 5176 0 0 0 90 63 20 ++sda5 11122467 0 0 5029 0 0 0 60 34 13 ++sda5 11122468 0 0 5028 0 0 1 79 38 19 ++sda5 11122469 0 0 5030 0 0 0 44 27 12 ++sda5 11122470 0 0 5614 0 0 0 106 62 25 ++ ++Example 3: ++# inspect specified device /dev/sda5, and show jbd2 handle's context when one ++# jbd2 handle runs more than 1ms. ++python alijbd2stat.py -d sda5 -t 1 ++ ++comm: dd pid: 106212 tid: 11151230 pre_start_time: 0 run_time: 585432 sched_delay: 0 ++ jbd2_journal_stop_[k] ++ jbd2_journal_stop_[k] ++ ext4_dirty_inode_[k] ++ __mark_inode_dirty_[k] ++ generic_write_end_[k] ++ ext4_da_write_end_[k] ++ generic_perform_write_[k] ++ __generic_file_write_iter_[k] ++ ext4_file_write_iter_[k] ++ __vfs_write_[k] ++ vfs_write_[k] ++ sys_write_[k] ++ do_syscall_64_[k] ++ entry_SYSCALL_64_after_swapgs_[k] ++comm: kworker/u128:3 pid: 83229 tid: 0 pre_start_time: 0 run_time: 589686 sched_delay: 0 ++ jbd2_journal_stop_[k] ++ jbd2_journal_stop_[k] ++ ext4_convert_unwritten_extents_[k] ++ ext4_end_io_rsv_work_[k] ++ process_one_work_[k] ++ worker_thread_[k] ++ kthread_[k] ++ ret_from_fork_[k] ++sda5 11151229 0 0 5000 0 0 595 401233 255 36 +diff --git a/tools/alimutexsnoop.py b/tools/alimutexsnoop.py +new file mode 100755 +index 0000000..d28ce05 +--- /dev/null ++++ b/tools/alimutexsnoop.py +@@ -0,0 +1,273 @@ ++#!/usr/bin/python ++# @lint-avoid-python-3-compatibility-imports ++from __future__ import print_function ++ ++import argparse ++import ctypes as ct ++import os ++import platform ++import re ++import sys ++import time ++ ++from bcc import BPF ++from bcc.utils import printb ++from datetime import datetime ++from time import strftime ++ ++# ++# alimutexsnoop.py Trace all process that holds a mutex longer than a ++# specified time interval (500us, etc). ++# For Linux, uses BCC, eBPF. Embedded C. 
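++#
++# Implementation note: a kretprobe on mutex_lock() records the acquisition
++# timestamp keyed by (pid_tgid, mutex address), and a kprobe on
++# mutex_unlock() computes the hold time, emitting an event with the kernel
++# and user stacks whenever the hold time exceeds the threshold.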
++# ++# USAGE: alimutexsnoop [-h] [-t] [-p PID] ++# ++# Author: Alvin Zheng ++_examples = """examples: ++ alimutexsnoop # trace all mutexes that are held for a long time ++ alimutexsnoop -t 50 # set the time threshold (us), mutexes are held longer ++ than this time threshold will be printed ++ alimutexsnoop -p 123 # only trace the specified process ++""" ++""" ++ Copyright (c) 2019 Alvin Zheng, Alibaba, Inc. ++ Licensed under the Apache License, Version 2.0 (the "License") ++""" ++ ++def positive_int(val): ++ try: ++ ival = int(val) ++ except ValueError: ++ raise argparse.ArgumentTypeError("must be an integer") ++ if ival <= 0: ++ raise argparse.ArgumentTypeError("must be positive value") ++ return ival ++ ++def _getParser(): ++ parser = argparse.ArgumentParser( ++ description="Trace all mutexes that have been held for a long time", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=_examples) ++ a = parser.add_argument ++ a("-t", "--threshold", type=positive_int, default=1000, ++ help="The mutexes held longer than this time threshold will be traced(us)") ++ a("-p", "--pid", type=positive_int, ++ help="trace this PID only") ++ return parser.parse_args ++ ++ ++class Global(): ++ parse_args = _getParser() ++ args = None ++ argv = None ++ bpf = None ++ ++class Data(ct.Structure): ++ """Event data matching struct data_t in _embedded_c().""" ++ _TASK_COMM_LEN = 16 # linux/sched.h ++ _pack_ = 1 ++ _fields_ = [ ++ ("start_time", ct.c_ulonglong), # task->start_time ++ ("pid_tgid", ct.c_ulonglong), ++ ("pid", ct.c_uint), # task->tgid, thread group id == sys_getpid() ++ ("mutex_addr", ct.c_ulonglong), ++ ("k_stack_id", ct.c_int), ++ ("u_stack_id", ct.c_int), ++ ("task", ct.c_char * _TASK_COMM_LEN) ++ ] ++ ++def _embedded_c(args): ++ """Generate C programs for mutex_lock and mutex_unlock.""" ++ c = """ ++ EBPF_COMMENT ++ #include ++ BPF_STATIC_ASSERT_DEF ++ BPF_STACK_TRACE(stack_traces,655360); ++ struct key_t{ ++ u64 pid_tgid; ++ u64 mutex_addr; ++ } __attribute__((packed)); ++ BPF_HASH(held_mutexes, struct key_t,u64); ++ struct data_t{ ++ u64 start_time; ++ u64 pid_tgid; ++ u32 pid; ++ u64 mutex_addr; ++ u32 k_stack_id; ++ u32 u_stack_id; ++ char task[TASK_COMM_LEN]; ++ } __attribute__((packed)); ++ BPF_STATIC_ASSERT(sizeof(struct data_t) == CTYPES_SIZEOF_DATA); ++ BPF_PERF_OUTPUT(results); ++ ++ int kprobe_ret_mutex_lock(struct pt_regs * ctx,struct mutex * lock) ++ { ++ struct task_struct *task = (typeof(task))bpf_get_current_task(); ++ FILTER_SELF_LOCK ++ if (FILTER_PID) { return 0; } ++ struct key_t key = { ++ .pid_tgid=bpf_get_current_pid_tgid(), ++ .mutex_addr=(u64)((void*)lock) ++ }; ++ u64 locktime = bpf_ktime_get_ns(); ++ u64 * ptr=held_mutexes.lookup_or_init(&key, &locktime); ++ if(!ptr){ ++ bpf_trace_printk("could not add held_mutexes, thread: %d, mutex: %p \\n" ,task->tgid,lock); ++ return 1; ++ } ++ return 0; ++ } ++ int kprobe_mutex_unlock(struct pt_regs * ctx, struct mutex * lock) ++ { ++ struct task_struct *task = (typeof(task))bpf_get_current_task(); ++ FILTER_SELF_UNLOCK ++ if (FILTER_PID) { return 0; } ++ struct key_t key = { ++ .pid_tgid=bpf_get_current_pid_tgid(), ++ .mutex_addr=(u64)((void*)lock) ++ }; ++ u64 cur = bpf_ktime_get_ns(); ++ u64 * locktime=held_mutexes.lookup(&key); ++ if(!locktime){ ++ bpf_trace_printk("cannot find the lock in thread: %d mutex: %p \\n",task->tgid,lock); ++ return 1; ++ } ++ if(cur - (*locktime) > TIME_THRESHOLD){ ++ struct data_t data = { ++ .start_time=task->start_time, ++ .pid_tgid=bpf_get_current_pid_tgid(), ++ .pid=task->pid, 
++ .mutex_addr=(u64)((void*)lock) ++ }; ++ bpf_get_current_comm(&data.task,sizeof(data.task)); ++ data.k_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID); ++ data.u_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID|BPF_F_USER_STACK); ++ results.perf_submit(ctx,&data,sizeof(data)); ++ } ++ held_mutexes.delete(&key); ++ return 0; ++ } ++ """ ++ # TODO: this macro belongs in bcc/src/cc/export/helpers.h ++ bpf_static_assert_def = r""" ++ #ifndef BPF_STATIC_ASSERT ++ #define BPF_STATIC_ASSERT(condition) __attribute__((unused)) \ ++ extern int bpf_static_assert[(condition) ? 1 : -1] ++ #endif ++ """ ++ selfpid = os.getpid() ++ filter_self = r""" ++ if(task->tgid==%d){ ++ return 0; ++ } ++ """ % selfpid ++ code_substitutions = [ ++ ('EBPF_COMMENT', _ebpf_comment()), ++ ("BPF_STATIC_ASSERT_DEF", bpf_static_assert_def), ++ ("CTYPES_SIZEOF_DATA", str(ct.sizeof(Data))), ++ ("TIME_THRESHOLD", str(Global.args.threshold * 1000)), # ns->us ++ ("FILTER_SELF_LOCK", filter_self), ++ ("FILTER_SELF_UNLOCK", filter_self), ++ ('FILTER_PID', '0' if not Global.args.pid else "task->pid != %d" % Global.args.pid) ++ ] ++ for old, new in code_substitutions: ++ c = c.replace(old, new) ++ # print(c) ++ return c ++ ++def _ebpf_comment(): ++ """Return a C-style comment with information about the generated code.""" ++ comment = ('Created by %s at %s:\n\t%s' % ++ (sys.argv[0], strftime("%Y-%m-%d %H:%M:%S %Z"), _embedded_c.__doc__)) ++ args = str(vars(Global.args)).replace('{', '{\n\t').replace(', ', ',\n\t').replace('}', ',\n }\n\n') ++ return ("\n /*" + ("\n %s\n\n ARGV = %s\n\n ARGS = %s/" % ++ (comment, ' '.join(Global.argv), args)) ++ .replace('\n', '\n\t*').replace('\t', ' ')) ++ ++def _print_header(): ++ print("%-16s %-6s %-16s \n" % ("PCOMM", "PID", "MUTEX_ADDR")) ++ ++def _print_event(cpu, data, size): # callback ++ """Print the long-held mutexes.""" ++ stack_traces = Global.bpf['stack_traces'] ++ e = ct.cast(data, ct.POINTER(Data)).contents ++ print("%-16s %-6d %-16x\n" % ++ (e.task.decode(), e.pid, e.mutex_addr), end="") ++ # print the kernel stack ++ if e.k_stack_id >= 0: ++ cur_stack = reversed(list(stack_traces.walk(e.k_stack_id))) ++ for addr in cur_stack: ++ printb(b"Kernel Stack: %-16x %s" % (addr, Global.bpf.ksym(addr))) ++ else: ++ print("No KERNEL STACK FOUND") ++ # print the user stack ++ if e.u_stack_id >= 0: ++ cur_stack = reversed(list(stack_traces.walk(e.u_stack_id))) ++ for addr in cur_stack: ++ printb(b"USER Stack: %-16x %s" % (addr, Global.bpf.sym(addr, e.pid_tgid))) ++ else: ++ print("No USER STACK FOUND") ++ print("=================================") ++ print() ++# ============================= ++# Module: These functions are available for import ++# ============================= ++def initialize(arg_list=sys.argv[1:]): ++ """Trace all long-held mutexes ++ ++ arg_list - list of args, if omitted then uses command line args ++ arg_list is passed to argparse.ArgumentParser.parse_args() ++ ++ Returns a tuple (return_code, result) ++ 0 = Ok, result is the return value from BPF() ++ 1 = args.ebpf is requested, result is the generated C code ++ os.EX_NOPERM: need CAP_SYS_ADMIN, result is error message ++ os.EX_SOFTWARE: internal software error, result is error message ++ """ ++ Global.argv = arg_list ++ Global.args = Global.parse_args(arg_list) ++ if os.geteuid() != 0: ++ return (os.EX_NOPERM, "Need sudo (CAP_SYS_ADMIN) for BPF() system call") ++ c = _embedded_c(Global.args) ++ try: ++ return (os.EX_OK, BPF(text=c)) ++ except Exception as e: ++ error = format(e) ++ return 
(os.EX_SOFTWARE, "BPF error: " + error) ++ ++def snoop(bpf, event_handler): ++ """Call event_handler when release a lock that is held for a long time ++ bpf - result returned by successful initialize() ++ event_handler - callback function to handle termination event ++ args.pid - Return after event_handler is called, only monitoring this pid ++ """ ++ bpf["results"].open_perf_buffer(event_handler) ++ while True: ++ bpf.perf_buffer_poll() ++ if Global.args.pid: ++ if not os.path.exists('/proc/%d' % Global.args.pid): ++ # the target process has exited ++ break ++ ++# ============================= ++# Script: invoked as a script ++# ============================= ++def main(): ++ try: ++ rc, buffer = initialize() ++ if rc != os.EX_OK: ++ print(buffer) ++ sys.exit(-1) ++ Global.bpf = buffer ++ Global.bpf.attach_kretprobe(event="mutex_lock", fn_name="kprobe_ret_mutex_lock") ++ Global.bpf.attach_kprobe(event="mutex_unlock", fn_name="kprobe_mutex_unlock") ++ _print_header() ++ snoop(buffer, _print_event) ++ except KeyboardInterrupt: ++ print() ++ sys.exit() ++ ++ return 0 ++ ++if __name__ == '__main__': ++ main() +diff --git a/tools/alimutexsnoop_example.txt b/tools/alimutexsnoop_example.txt +new file mode 100644 +index 0000000..67ca03e +--- /dev/null ++++ b/tools/alimutexsnoop_example.txt +@@ -0,0 +1,79 @@ ++Demonstrations of alimutexsnoop. ++ ++This profile tool monitors the mutex that is held by the process longer than ++a specified time threshold, such as 100us. Once alimutexsnoop finds such mutexes, ++it prints the pid of the process that held the mutex and the address of the ++mutex. In addition, alimutexsnoop also prints the user and kernel stack of the ++process. ++ ++ ++Usage: ++alimutexsnoop.py [-h] [-t THRESHOLD] [-p PID] ++ ++Trace all mutexes that have been held for a long time ++ ++optional arguments: ++ -h, --help show this help message and exit ++ -t THRESHOLD, --threshold THRESHOLD ++ The mutexes held longer than this time threshold will ++ be traced(us) ++ -p PID, --pid PID trace this PID only ++ ++examples: ++ alimutexsnoop # trace all mutexes that are held for a long time ++ alimutexsnoop -t 50 # set the time threshold (us), mutexes are held longer ++ than this time threshold will be printed ++ alimutexsnoop -p 123 # only trace the specified process ++ ++ ++Output example: ++ ++python alimutexsnoop.py -t 20 # monitor the mutexes that are held by processes longer than 20 us ++ ++PCOMM PID MUTEX_ADDR ++ ++AliYunDun 4283 ffffa30fd1b4ea48 ++Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe ++Kernel Stack: ffffffffa500201b do_syscall_64 ++Kernel Stack: ffffffffa52316fc ksys_read ++Kernel Stack: ffffffffa57cfd21 mutex_unlock ++USER Stack: 7f1532c4e6fd read ++================================= ++ ++AliYunDun 4283 ffffa310d7e43c40 ++Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe ++Kernel Stack: ffffffffa500201b do_syscall_64 ++Kernel Stack: ffffffffa52316ba ksys_read ++Kernel Stack: ffffffffa5231149 vfs_read ++Kernel Stack: ffffffffa5230f86 __vfs_read ++Kernel Stack: ffffffffa5254b30 seq_read ++Kernel Stack: ffffffffa57cfd21 mutex_unlock ++USER Stack: 7f1532c4e6fd read ++================================= ++ ++AliYunDun 4283 ffffa30fd1b4e348 ++Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe ++Kernel Stack: ffffffffa500201b do_syscall_64 ++Kernel Stack: ffffffffa52316fc ksys_read ++Kernel Stack: ffffffffa57cfd21 mutex_unlock ++USER Stack: 7f1532c4e6fd read ++================================= ++ ++AliYunDunUpdate 4256 ffffa310d1146a40 ++Kernel 
Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe ++Kernel Stack: ffffffffa500201b do_syscall_64 ++Kernel Stack: ffffffffa52316ba ksys_read ++Kernel Stack: ffffffffa5231149 vfs_read ++Kernel Stack: ffffffffa5230f86 __vfs_read ++Kernel Stack: ffffffffa5254b30 seq_read ++Kernel Stack: ffffffffa57cfd21 mutex_unlock ++USER Stack: 7fbce69966fd read ++================================= ++ ++AliYunDunUpdate 4256 ffffa310d3b48a48 ++Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe ++Kernel Stack: ffffffffa500201b do_syscall_64 ++Kernel Stack: ffffffffa52316fc ksys_read ++Kernel Stack: ffffffffa57cfd21 mutex_unlock ++USER Stack: 7fbce69966fd read ++================================= +diff --git a/tools/alirunqinfo.py b/tools/alirunqinfo.py +new file mode 100755 +index 0000000..102c763 +--- /dev/null ++++ b/tools/alirunqinfo.py +@@ -0,0 +1,360 @@ ++#!/usr/bin/python ++# @lint-avoid-python-3-compatibility-imports ++# ++# alirunqinfo Summarize task on-CPU time before being descheduled. ++# For Linux, uses BCC, eBPF. ++# ++# USAGE: alirunqinfo [-h] [-T] [-N] [-D] [-C] [-P] [-c CPU] [-p PID] ++# [interval] [outputs] ++# ++# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. ++# Licensed under the Apache License, Version 2.0 (the "License") ++ ++from __future__ import print_function ++from bcc import BPF ++from time import sleep, strftime ++import argparse ++import ctypes ++import signal ++ ++# arguments ++examples = """examples: ++ ./alirunqinfo # sum runqueue statistics ++ ./alirunqinfo -C # show per-CPU runqueue statistics ++ ./alirunqinfo -P # show per-thread runqueue statistics ++ ./alirunqinfo -D # show runqueue statistics as histograms ++ ./alirunqinfo -c 0 # show runqueue statistics of CPU 0 only ++ ./alirunqinfo -p 25 # show runqueue statistics of PID 25 only ++ ./alirunqinfo 1 10 # print 1 second summaries, 10 times ++ ./alirunqinfo -NT 1 # 1s summaries, nanoseconds, and timestamps ++""" ++parser = argparse.ArgumentParser( ++ description="Summarize runqueue statistics.", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=examples) ++parser.add_argument("-T", "--timestamp", action="store_true", ++ help="include timestamp on output") ++parser.add_argument("-N", "--nanoseconds", action="store_true", ++ help="output in nanoseconds") ++parser.add_argument("-C", "--cpus", action="store_true", ++ help="show per-CPU runqueue statistics") ++parser.add_argument("-P", "--pids", action="store_true", ++ help="show per-thread runqueue statistics") ++parser.add_argument("-D", "--dist", action="store_true", ++ help="show distributions as histograms") ++parser.add_argument("-c", "--cpu", ++ help="show runqueue statistics on specific CPU only") ++parser.add_argument("-p", "--pid", ++ help="show runqueue statistics of specific PID only") ++parser.add_argument("interval", nargs="?", default=99999999, ++ help="output interval, in seconds") ++parser.add_argument("outputs", nargs="?", default=99999999, ++ help="number of outputs") ++parser.add_argument("--ebpf", action="store_true", ++ help=argparse.SUPPRESS) ++ ++args = parser.parse_args() ++countdown = int(args.outputs) ++if args.pid: ++ args.pids = True ++if args.cpu: ++ args.cpus = True ++if (not args.cpus) and (not args.pids): ++ args.cpus = True ++ ++if args.nanoseconds: ++ factor = 1 ++ label = "nsecs" ++else: ++ factor = 1000 ++ label = "usecs" ++debug = 0 ++ ++# define BPF program ++bpf_text = """ ++#include ++ ++typedef struct time_stamp { ++ u64 stamp; ++ u32 pid; ++} time_stamp_t; ++ ++BPF_PERCPU_ARRAY(start, time_stamp_t, 1); ++ 
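++/*
++ * The per-CPU "start" slot records the timestamp and PID of the task that
++ * last switched onto this CPU; sched_switch reads and refreshes it.
++ * STORAGE below is substituted from Python with either a BPF_HASH (summary
++ * mode) or a BPF_HISTOGRAM (-D distribution mode), keyed per CPU and/or
++ * per PID.
++ */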
++STORAGE ++ ++ ++static inline int sched_switch_exit(u32 cpu, u32 pid, u64 now, time_stamp_t *new) { ++ PID_FILTER ++ ++ u32 idx = 0; ++ // fetch timestamp and calculate delta ++ time_stamp_t *stampp = start.lookup(&idx); ++ if (!stampp || stampp->pid != pid) { ++ return 0; // missed start ++ } ++ u64 delta = now - stampp->stamp; ++ start.update(&idx, new); ++ ++ UPDATE ++ ++ return 0; ++} ++ ++static inline int sched_switch_enter(u32 pid, u64 now, time_stamp_t *new) { ++ PID_FILTER ++ ++ u32 idx = 0; ++ new->stamp = now; ++ new->pid = pid; ++ ++ start.update(&idx, new); ++ return 0; ++} ++ ++TRACEPOINT_PROBE(sched, sched_switch) ++{ ++ u32 cpu = bpf_get_smp_processor_id(); ++ CPU_FILTER ++ ++ u64 now = bpf_ktime_get_ns(); ++ u32 pid, idx = 0; ++ time_stamp_t new = {}; ++ ++ pid = args->prev_pid; ++ if (pid != 0) { ++ sched_switch_exit(cpu, pid, now, &new); ++ } ++ ++ pid = args->next_pid; ++ if (pid != 0) { ++ sched_switch_enter(pid, now, &new); ++ } ++ ++ return 0; ++} ++""" ++ ++ ++# replace STORAGE ++map_key = """ ++typedef struct map_key { ++ u32 cpu; ++ u32 pid; ++} map_key_t; ++""" ++ ++map_val = """ ++typedef struct map_val { ++ u64 time; ++ u64 count; ++} map_val_t; ++""" ++ ++if args.dist: ++ # diagram mode, we need BPF_HISTOGRAM ++ if args.cpus and args.pids: ++ storage = map_key + """ ++typedef struct map_key_slot { ++ map_key_t key; ++ u64 slot; ++} map_key_slot_t; ++""" ++ else: ++ storage = """ ++typedef struct map_key_slot { ++ u64 key; ++ u64 slot; ++} map_key_slot_t; ++""" ++ storage += """BPF_HISTOGRAM(dist, map_key_slot_t);""" ++ ++else: ++ # normal mode, we need BPF_HASH ++ storage = map_val ++ if args.cpus and args.pids: ++ storage += map_key + """BPF_HASH(res, map_key_t, map_val_t);""" ++ else: ++ storage += """BPF_HASH(res, u32, map_val_t);""" ++ ++bpf_text = bpf_text.replace('STORAGE', storage) ++ ++ ++# replace UPDATE ++if args.dist: ++ if args.cpus and args.pids: ++ key_exp = "{.cpu = cpu, .pid = pid,}" ++ elif args.cpus: ++ key_exp = "cpu" ++ elif args.pids: ++ key_exp = "pid" ++ ++ update = """ ++ map_key_slot_t key_slot = { ++ .key = %s, ++ .slot = bpf_log2l(delta / FACTOR), ++ }; ++ dist.increment(key_slot); ++""" % key_exp ++ ++else: ++ if args.cpus and args.pids: ++ update = """ ++ map_key_t key = { ++ .cpu = cpu, ++ .pid = pid, ++ }; ++""" ++ elif args.cpus: ++ update = "u32 key = cpu;" ++ elif args.pids: ++ update = "u32 key = pid;" ++ ++ update += """ ++ map_val_t *valp = res.lookup(&key); ++ if (valp) { ++ valp->count += 1; ++ valp->time += delta; ++ } ++ else { ++ map_val_t val = { ++ .count = 1, ++ .time = delta, ++ }; ++ res.update(&key, &val); ++ } ++""" ++bpf_text = bpf_text.replace('UPDATE', update) ++bpf_text = bpf_text.replace('FACTOR', str(factor)) ++ ++ ++# replace FILTER ++cpu_filter = "if (cpu != %s) {return 0;}" % args.cpu if args.cpu else "" ++bpf_text = bpf_text.replace('CPU_FILTER', cpu_filter) ++ ++pid_filter = "if (pid != %s) {return 0;}" % args.pid if args.pid else "" ++bpf_text = bpf_text.replace('PID_FILTER', pid_filter) ++ ++ ++# output eBPF program C code after it is replaced, used by debugging ++if debug or args.ebpf: ++ print(bpf_text) ++ if args.ebpf: ++ exit() ++ ++ ++# load BPF program ++b = BPF(text=bpf_text) ++ ++# print header ++print("Tracing runqueue statistics", end="") ++if args.pid: ++ print(" of PID %s" % args.pid, end="") ++if args.cpu: ++ print(" on CPU %s" % args.cpu, end="") ++if args.dist: ++ print(", in diagram mode", end="") ++print("...Hit Ctrl-C to end.") ++ ++if args.dist: ++ tab = b.get_table("dist") 
++else:
++    tab = b.get_table("res")
++
++
++def pid_to_comm(pid):
++    try:
++        return open("/proc/%d/comm" % pid, "r").read().rstrip("\n")
++    except IOError:
++        return ""
++
++
++def print_per_cpu_thread(tab):
++    cur_cpu = -1
++    # k is struct map_key; v is struct map_val
++    for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].pid)):
++        if cur_cpu != k.cpu:
++            print("\nCPU%d:" % k.cpu)
++            print("%10s %40s %10s %10s" % ("PID", "Thread", "Count", "TOTAL_" + label))
++            cur_cpu = k.cpu
++        print("%10d %40s %10d %10d" % (k.pid, pid_to_comm(k.pid), v.count, v.time / factor))
++
++
++def print_per_cpu(tab):
++    print("%10s %10s %10s" % ("CPU", "Count", "TOTAL_" + label))
++    # k is c_uint (representing CPU), v is struct map_val
++    for k, v in sorted(tab.items(), key=lambda res: res[0].value):
++        print("%10d %10d %10d" % (k.value, v.count, v.time / factor))
++
++
++def print_per_thread(tab):
++    print("%10s %40s %10s %10s" % ("PID", "Thread", "Count", "TOTAL_" + label))
++    # k is c_uint (representing PID), v is struct map_val
++    for k, v in sorted(tab.items(), key=lambda res: res[0].value):
++        print("%10d %40s %10d %10d" % (k.value, pid_to_comm(k.value), v.count, v.time / factor))
++
++
++def print_dist_per_cpu_thread(tab):
++    tab.print_log2_hist("time_" + label, "section",
++        # k is (cpuN, pidN) formatted by bucket_fn
++        section_print_fn=lambda k: ("CPU %d, PID %d (%s)" % (k[0], k[1], pid_to_comm(k[1]))),
++        bucket_fn=lambda k: (k.cpu, k.pid),  # k is struct map_key
++        bucket_sort_fn=lambda k: sorted(k))  # k is struct map_key
++
++
++def print_dist_per_cpu(tab):
++    tab.print_log2_hist("time_" + label, "CPU",
++        section_print_fn=lambda k: str(k),  # k is u64 representing CPU
++        bucket_sort_fn=lambda k: sorted(k))  # k is u64 representing CPU
++
++
++def print_dist_per_thread(tab):
++    tab.print_log2_hist("time_" + label, "Thread",
++        section_print_fn=lambda k: ("%d (%s)" % (k, pid_to_comm(k))),  # k is u64 representing PID
++        bucket_sort_fn=lambda k: sorted(k))  # k is u64 representing PID
++
++
++# signal handler
++def signal_ignore(signal, frame):
++    print()
++
++
++# output
++while (1):
++    try:
++        sleep(int(args.interval))
++    except KeyboardInterrupt:
++        # as cleanup can take many seconds, trap Ctrl-C:
++        signal.signal(signal.SIGINT, signal_ignore)
++        countdown = 1
++
++    print()
++    if args.timestamp:
++        print("%-8s\n" % strftime("%H:%M:%S"), end="")
++
++    if args.dist:
++        if args.cpus and args.pids:
++            print_dist_per_cpu_thread(tab)
++        elif args.cpus:
++            print_dist_per_cpu(tab)
++        elif args.pids:
++            print_dist_per_thread(tab)
++        else:
++            print("Invalid parameter")
++            countdown = 1
++    else:
++        if args.cpus and args.pids:
++            print_per_cpu_thread(tab)
++        elif args.cpus:
++            print_per_cpu(tab)
++        elif args.pids:
++            print_per_thread(tab)
++        else:
++            print("Invalid parameter")
++            countdown = 1
++
++    tab.clear()
++
++    countdown -= 1
++    if countdown == 0:
++        exit()
+diff --git a/tools/alirunqinfo_example.txt b/tools/alirunqinfo_example.txt
+new file mode 100644
+index 0000000..a1c24ca
+--- /dev/null
++++ b/tools/alirunqinfo_example.txt
+@@ -0,0 +1,203 @@
++This program summarizes runqueue statistics, that is, task on-CPU time
++before being descheduled, per CPU, per PID, or as a histogram.
++
++There are two major categories of parameters:
++1. one category of parameters controls the sampling source, such as
++-c[--cpu] and -p[--pid]
++2. 
another category of parameters controls the display format,
++such as -C[--cpus], -P[--pids] and -D[--dist]
++
++These two categories of parameters are orthogonal.
++
++
++Example 1.
++
++By default, the program will sample all processes on all runqueues
++of all CPUs. Statistics of all processes on the same CPU are
++aggregated and displayed in a per-CPU flavor.
++
++In this case, "alirunqinfo" is the same as "alirunqinfo -C".
++
++# alirunqinfo
++Tracing runqueue statistics... Hit Ctrl-C to end.
++^C
++       CPU      Count TOTAL_usecs
++         0         77         207
++         1          9          27
++         2          3          14
++         3          2          15
++         4         45         122
++         6         41         380
++         7          6          25
++         9         18          61
++        10         19          50
++        12          2          19
++        14          2         189
++        16          1          13
++        17         18          49
++        20         36          73
++        21         18          35
++        22         22         109
++        24         25          92
++        28         21          68
++        30          4          95
++
++
++Example 2.
++
++Display the output in a per-process flavor with the "-P" parameter.
++In this case, all processes on all CPUs are sampled as usual, but
++all statistics of the same process are aggregated, whichever CPU it
++runs on.
++
++# alirunqinfo -P
++Tracing runqueue statistics... Hit Ctrl-C to end.
++^C
++       PID                              Thread      Count TOTAL_usecs
++        10                           rcu_sched        122         264
++       382           kworker/14:1-mm_percpu_wq          1          20
++      2926                                rngd          1           2
++      3327           kworker/16:2-mm_percpu_wq          1          15
++      3384                        in:imjournal          2           8
++      3428                     AliYunDunUpdate         20          60
++      3429                     AliYunDunUpdate         20          40
++      3442                      aliyun-service         20          98
++      3449                           AliYunDun         20         137
++      3450                           AliYunDun         19          58
++      3451                           AliYunDun         20          56
++      3462                           AliYunDun         19          46
++      3463                           AliYunDun         19         100
++      3464                           AliYunDun          2         220
++      3467                           AliYunDun          2           6
++      3468                           AliYunDun         26          61
++      3469                           AliYunDun         39          78
++      3470                           AliYunDun         39         125
++      3471                           AliYunDun          2          10
++      3473                           AliYunDun          2           4
++      3474                           AliYunDun          1           4
++      3475                           AliYunDun          2          11
++      3476                           AliYunDun         10          30
++      3533                                sshd          5         193
++      4573           kworker/27:2-mm_percpu_wq          2          13
++      5016           kworker/24:0-mm_percpu_wq          1           5
++      5281  kworker/4:0-events_power_efficient          9          19
++      5999  kworker/0:2-events_power_efficient          2          34
++      6429        kworker/u64:0-events_unbound          3          14
++      6468        kworker/u64:1-events_unbound          2          37
++
++
++Example 3.
++
++You can set the "-P" and "-C" parameters at the same time, in which case
++the output is displayed in a per-CPU and per-process flavor.
++
++# alirunqinfo -PC
++Tracing runqueue statistics... Hit Ctrl-C to end.
++^C
++
++CPU0:
++       PID                              Thread      Count TOTAL_usecs
++        10                           rcu_sched         72         147
++      3467                           AliYunDun          1           6
++
++CPU1:
++       PID                              Thread      Count TOTAL_usecs
++      3476                           AliYunDun          5          26
++
++CPU2:
++       PID                              Thread      Count TOTAL_usecs
++      3384                        in:imjournal          2          12
++
++CPU3:
++       PID                              Thread      Count TOTAL_usecs
++      2926                                rngd          2           7
++
++
++Example 4.
++
++Sample a specified process only with "-p"; "-P" is added implicitly
++when "-p" is specified.
++
++# alirunqinfo -p 3429
++Tracing runqueue statistics... Hit Ctrl-C to end.
++^C
++       PID                              Thread      Count TOTAL_usecs
++      3429                     AliYunDunUpdate         14          40
++
++
++Example 5.
++
++When both the "-p" and "-C" parameters are specified, the output is displayed
++in a per-CPU and per-process flavor.
++
++# alirunqinfo -p 3429 -C
++Tracing runqueue statistics... Hit Ctrl-C to end.
++^C
++
++CPU22:
++       PID                              Thread      Count TOTAL_usecs
++      3429                     AliYunDunUpdate         29          67
++
++
++Example 6.
++
++Sample processes on a specified CPU only with "-c"; "-C" is added
++implicitly when "-c" is specified.
++
++# alirunqinfo -c 22
++Tracing runqueue statistics... Hit Ctrl-C to end.
++^C
++       CPU      Count TOTAL_usecs
++        22         24          90
++
++
++Example 7.
++
++Display the output as a histogram when "-D" is specified.
++
++# alirunqinfo -p 3429 -D
++Tracing runqueue statistics... Hit Ctrl-C to end.
++^C ++ ++Thread = 3429 (AliYunDunUpdate) ++ time_usecs : count distribution ++ 0 -> 1 : 22 |****************************************| ++ 2 -> 3 : 12 |********************* | ++ 4 -> 7 : 0 | | ++ 8 -> 15 : 0 | | ++ 16 -> 31 : 0 | | ++ 32 -> 63 : 1 |* | ++ ++ ++ ++USAGE message: ++ ++# alirunqinfo -h ++usage: alirunqinfo.py [-h] [-T] [-N] [-C] [-P] [-D] [-c CPU] [-p PID] ++ [interval] [outputs] ++ ++Summarize runqueue statistics, including total count, time, etc. ++ ++positional arguments: ++ interval output interval, in seconds ++ outputs number of outputs ++ ++optional arguments: ++ -h, --help show this help message and exit ++ -T, --timestamp include timestamp on output ++ -N, --nanoseconds output in nanoseconds ++ -C, --cpus show per-CPU runqueue statistics ++ -P, --pids show per-thread runqueue statistics ++ -D, --dist show distributions as histograms ++ -c CPU, --cpu CPU show runqueue statistics on specific CPU only ++ -p PID, --pid PID show runqueue statistics of specific PID only ++ ++examples: ++ ./alirunqinfo # sum runqueue statistics ++ ./alirunqinfo -C # show per-CPU runqueue statistics ++ ./alirunqinfo -P # show per-thread runqueue statistics ++ ./alirunqinfo -D # show runqueue statistics as histograms ++ ./alirunqinfo -c 0 # show runqueue statistics of CPU 0 only ++ ./alirunqinfo -p 25 # show runqueue statistics of PID 25 only ++ ./alirunqinfo 1 10 # print 1 second summaries, 10 times ++ ./alirunqinfo -NT 1 # 1s summaries, nanoseconds, and timestamps +diff --git a/tools/alisoftirqs.py b/tools/alisoftirqs.py +new file mode 100755 +index 0000000..202114f +--- /dev/null ++++ b/tools/alisoftirqs.py +@@ -0,0 +1,215 @@ ++#!/usr/bin/python ++# @lint-avoid-python-3-compatibility-imports ++# ++# alisoftirqs Summarize soft IRQ (interrupt) event time. ++# Enhanced version of BCC softirqs. ++# For Linux, uses BCC, eBPF. ++# ++# USAGE: alisoftirqs [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] [interval] [outputs] ++# ++# Copyright (c) 2015 Brendan Gregg. ++# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. 
++# Licensed under the Apache License, Version 2.0 (the "License") ++ ++from __future__ import print_function ++from bcc import BPF ++from time import sleep, strftime ++import argparse ++ ++# arguments ++examples = """examples: ++ ./alisoftirqs # sum soft irq event time ++ ./alisoftirqs -D # show soft irq event time as histograms ++ ./alisoftirqs -c 0 # show soft irq event time of CPU 0 only ++ ./alisoftirqs -i 25 # show event time of IRQ 25 only ++ ./alisoftirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only ++ ./alisoftirqs 1 10 # print 1 second summaries, 10 times ++ ./alisoftirqs -NT 1 # 1s summaries, nanoseconds, and timestamps ++""" ++parser = argparse.ArgumentParser( ++ description="Summarize soft irq statistics, including total count, time, etc.", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=examples) ++parser.add_argument("-T", "--timestamp", action="store_true", ++ help="include timestamp on output") ++parser.add_argument("-N", "--nanoseconds", action="store_true", ++ help="output in nanoseconds") ++parser.add_argument("-D", "--dist", action="store_true", ++ help="show distributions as histograms") ++parser.add_argument("-c", "--cpu", ++ help="output softIRQ statistics on specific CPU only") ++parser.add_argument("-i", "--irq", ++ help="output statistics of specific softIRQ") ++parser.add_argument("interval", nargs="?", default=99999999, ++ help="output interval, in seconds") ++parser.add_argument("outputs", nargs="?", default=99999999, ++ help="number of outputs") ++parser.add_argument("--ebpf", action="store_true", ++ help=argparse.SUPPRESS) ++ ++args = parser.parse_args() ++countdown = int(args.outputs) ++distmode = "1" if args.dist else "0" ++if args.nanoseconds: ++ factor = 1 ++ label = "nsecs" ++else: ++ factor = 1000 ++ label = "usecs" ++debug = 0 ++ ++# define BPF program ++bpf_text = """ ++#include ++ ++typedef struct irq_key { ++ u32 cpu; ++ u32 irq; ++} irq_key_t; ++ ++typedef struct irq_key_slot { ++ irq_key_t key; ++ u64 slot; ++} irq_key_slot_t; ++ ++typedef struct irq_val { ++ u64 count; ++ u64 time; ++ u64 time_max; ++} irq_val_t; ++ ++BPF_PERCPU_ARRAY(start, u64, 1); ++BPF_HASH(res, irq_key_t, irq_val_t); ++BPF_HISTOGRAM(dist, irq_key_slot_t); ++ ++ ++TRACEPOINT_PROBE(irq, softirq_entry) ++{ ++ COND_FILTER ++ ++ u32 idx = 0; ++ u64 ts = bpf_ktime_get_ns(); ++ start.update(&idx, &ts); ++ ++ return 0; ++} ++ ++TRACEPOINT_PROBE(irq, softirq_exit) ++{ ++ u64 *tsp, delta; ++ u32 cpu = bpf_get_smp_processor_id(); ++ u32 irq = args->vec; ++ ++ u32 idx = 0; ++ // fetch timestamp and calculate delta ++ tsp = start.lookup(&idx); ++ if (!tsp || *tsp == 0) { ++ return 0; // missed start ++ } ++ ++ delta = bpf_ktime_get_ns() - *tsp; ++ irq_key_t key = {.cpu = cpu, .irq = irq}; ++ ++#if DIST_MODE ++ irq_key_slot_t key_slot = { ++ .key = key, ++ .slot = bpf_log2l(delta / FACTOR), ++ }; ++ dist.increment(key_slot); ++#else ++ struct irq_val *valp, val; ++ valp = res.lookup(&key); ++ ++ if (valp) { ++ valp->count += 1; ++ valp->time += delta; ++ if (valp->time_max < delta) {valp->time_max = delta;} ++ } ++ else { ++ val.count = 1; ++ val.time = val.time_max = delta; ++ res.update(&key, &val); ++ } ++#endif ++ u64 zero = 0; ++ start.update(&idx, &zero); ++ ++ return 0; ++} ++""" ++ ++cond_filter = '' ++cpu_filter = "1" ++if args.cpu: ++ cpu_filter = "bpf_get_smp_processor_id() == %s" % args.cpu ++ ++irq_filter = "1" ++if args.irq: ++ irq_filter = "args->vec == %s" % args.irq ++ ++cond_filter = "if (!(%s && %s)) {return 0;}" % (cpu_filter, irq_filter) 
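++# For example, with "-c 0 -i 25" the substituted COND_FILTER should read
++# roughly (a sketch, not verified output):
++#   if (!(bpf_get_smp_processor_id() == 0 && args->vec == 25)) {return 0;}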
++bpf_text = bpf_text.replace('COND_FILTER', cond_filter)
++
++bpf_text = bpf_text.replace('FACTOR', "%d" % factor)
++bpf_text = bpf_text.replace('DIST_MODE', distmode)
++
++
++# output eBPF program C code after it is replaced, used by debugging
++if debug or args.ebpf:
++    print(bpf_text)
++    if args.ebpf:
++        exit()
++
++
++def vec_to_name(vec):
++    # copied from softirq_to_name() in kernel/softirq.c
++    # may need updates if new softirq handlers are added
++    return ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
++            "tasklet", "sched", "hrtimer", "rcu"][vec]
++
++def print_section(key):
++    return "CPU %d, softIRQ %d (%s)" % (key[0], key[1], vec_to_name(key[1]))
++
++
++# load BPF program
++b = BPF(text=bpf_text)
++print("Tracing soft irq event time... Hit Ctrl-C to end.")
++
++if args.dist:
++    tab = b.get_table("dist")
++else:
++    tab = b.get_table("res")
++
++
++# output
++while (1):
++    try:
++        sleep(int(args.interval))
++    except KeyboardInterrupt:
++        countdown = 1
++    print()
++    if args.timestamp:
++        print("%-8s\n" % strftime("%H:%M:%S"), end="")
++
++    if args.dist:
++        tab.print_log2_hist("time_" + label, "section",
++            section_print_fn=print_section,
++            bucket_fn=lambda k: (k.cpu, k.irq),
++            bucket_sort_fn=lambda keys: sorted(keys))
++    else:
++        cur_cpu = -1
++        for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].irq)):
++            if cur_cpu != k.cpu:
++                print("\nCPU%d:" % k.cpu)
++                print("%10s %20s %10s %10s %10s" %
++                    ("SOFTIRQ", "IRQHandler", "Count", "TOTAL_" + label, "MAX_" + label))
++                cur_cpu = k.cpu
++
++            print("%10d %20s %10d %10d %10d" %
++                (k.irq, vec_to_name(k.irq), v.count, v.time / factor, v.time_max / factor))
++
++    tab.clear()
++
++    countdown -= 1
++    if countdown == 0:
++        exit()
+diff --git a/tools/alisoftirqs_example.txt b/tools/alisoftirqs_example.txt
+new file mode 100644
+index 0000000..48e0023
+--- /dev/null
++++ b/tools/alisoftirqs_example.txt
+@@ -0,0 +1,128 @@
++Demonstrations of alisoftirqs, the enhanced version of BCC softirqs.
++
++This program traces soft interrupts (irqs), and stores timing statistics
++in-kernel for efficiency.
++
++
++Example 1
++
++By default, all statistics of all softirqs on all CPUs are sampled,
++and the output is displayed in a per-softirq and per-CPU flavor.
++
++# alisoftirqs
++Tracing soft irq event time... Hit Ctrl-C to end.
++^C
++
++CPU0:
++   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
++         7                sched          1          59         59
++
++CPU3:
++   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
++         1                timer          1           0          0
++         3               net_rx          6          29          9
++         7                sched          1           1          1
++         9                  rcu          1           0          0
++
++CPU4:
++   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
++         1                timer         29          11          1
++         7                sched         11          26          4
++         9                  rcu         19          22          5
++
++
++Example 2
++
++You can sample statistics on a specified CPU only with the "-c" option,
++and the output will be displayed in a per-softirq flavor.
++
++# alisoftirqs -c 6
++Tracing soft irq event time... Hit Ctrl-C to end.
++^C
++
++CPU6:
++   SOFTIRQ           IRQHandler      Count TOTAL_usecs  MAX_usecs
++         1                timer          7           2          0
++         7                sched          2           9          8
++         9                  rcu          5          10          5
++
++
++Example 3
++
++You can sample statistics of a specified softirq only with the "-i" option,
++and the output will be displayed in a per-CPU flavor.
++
++# alisoftirqs -i 1
++Tracing soft irq event time... Hit Ctrl-C to end.
++^C ++ ++CPU0: ++ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs ++ 1 timer 1 2 2 ++ ++CPU4: ++ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs ++ 1 timer 26 11 2 ++ ++CPU7: ++ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs ++ 1 timer 2 5 4 ++ ++ ++ ++Example 4 ++ ++The distribution of softirq run time can be printed as a histogram with "-D" ++option. The output will be displayed in a per-softirq and per-CPU flavor. ++ ++# alisoftirqs -D ++Tracing soft irq event time... Hit Ctrl-C to end. ++^C ++ ++section = CPU 3, softIRQ 3 (net_rx) ++ time_usecs : count distribution ++ 0 -> 1 : 0 | | ++ 2 -> 3 : 2 |****************************************| ++ 4 -> 7 : 0 | | ++ 8 -> 15 : 1 |******************** | ++ ++section = CPU 4, softIRQ 1 (timer) ++ time_usecs : count distribution ++ 0 -> 1 : 13 |****************************************| ++ ++section = CPU 4, softIRQ 7 (sched) ++ time_usecs : count distribution ++ 0 -> 1 : 2 |****************************************| ++ 2 -> 3 : 1 |******************** | ++ 4 -> 7 : 2 |****************************************| ++ ++ ++ ++USAGE message: ++ ++# alisoftirqs -h ++usage: alisoftirqs.py [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] ++ [interval] [outputs] ++ ++Summarize soft irq statistics, including total count, time, etc. ++ ++positional arguments: ++ interval output interval, in seconds ++ outputs number of outputs ++ ++optional arguments: ++ -h, --help show this help message and exit ++ -T, --timestamp include timestamp on output ++ -N, --nanoseconds output in nanoseconds ++ -D, --dist show distributions as histograms ++ -c CPU, --cpu CPU output softIRQ statistics on specific CPU only ++ -i IRQ, --irq IRQ output statistics of specific softIRQ ++ ++examples: ++ ./alisoftirqs # sum soft irq event time ++ ./alisoftirqs -D # show soft irq event time as histograms ++ ./alisoftirqs -c 0 # show soft irq event time of CPU 0 only ++ ./alisoftirqs -i 25 # show event time of IRQ 25 only ++ ./alisoftirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only ++ ./alisoftirqs 1 10 # print 1 second summaries, 10 times ++ ./alisoftirqs -NT 1 # 1s summaries, nanoseconds, and timestamps +diff --git a/tools/alisyscount.py b/tools/alisyscount.py +new file mode 100755 +index 0000000..921fac7 +--- /dev/null ++++ b/tools/alisyscount.py +@@ -0,0 +1,329 @@ ++#!/usr/bin/python ++# @lint-avoid-python-3-compatibility-imports ++# ++# alisyscount Summarize syscall counts and latencies. ++# Enhanced version of BCC syscount. ++# For Linux, uses BCC, eBPF. ++# ++# USAGE: alisyscount [-h] [-T] [-N] [-D] [-S] [-P] [-s syscall] [-p pid] [interval] [outputs] ++# ++# Copyright (c) 2015 Brendan Gregg. ++# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. ++# Licensed under the Apache License, Version 2.0 (the "License") ++ ++from __future__ import print_function ++from bcc import BPF ++from time import sleep, strftime ++import argparse ++import ctypes ++from bcc.syscall import syscall_name, syscalls ++ ++# arguments ++examples = """examples: ++ ./alisyscount # sum syscall counts and latencies. 
++ ./alisyscount -S # show per-syscall statistics ++ ./alisyscount -P # show per-thread syscall statistics ++ ./alisyscount -D # show syscall statistics as histograms ++ ./alisyscount -s 0 # show syscall statistics of syscall 0 only ++ ./alisyscount -p 25 # show syscall statistics of PID 25 only ++ ./alisyscount 1 10 # print 1 second summaries, 10 times ++ ./alisyscount -NT 1 # 1s summaries, nanoseconds, and timestamps ++""" ++parser = argparse.ArgumentParser( ++ description="Summarize syscall counts and latencies.", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=examples) ++parser.add_argument("-T", "--timestamp", action="store_true", ++ help="include timestamp on output") ++parser.add_argument("-N", "--nanoseconds", action="store_true", ++ help="output in nanoseconds") ++parser.add_argument("-S", "--syscalls", action="store_true", ++ help="show per-syscall statistics") ++parser.add_argument("-P", "--pids", action="store_true", ++ help="show per-thread syscall statistics") ++parser.add_argument("-D", "--dist", action="store_true", ++ help="show distributions as histograms") ++parser.add_argument("-s", "--syscall", ++ help="show statistics of specific syscall only") ++parser.add_argument("-p", "--pid", ++ help="show syscall statistics of specific PID only") ++parser.add_argument("interval", nargs="?", default=99999999, ++ help="output interval, in seconds") ++parser.add_argument("outputs", nargs="?", default=99999999, ++ help="number of outputs") ++parser.add_argument("--ebpf", action="store_true", ++ help=argparse.SUPPRESS) ++ ++args = parser.parse_args() ++countdown = int(args.outputs) ++if args.pid: ++ args.pids = True ++if args.syscall: ++ args.syscalls = True ++if (not args.syscalls) and (not args.pids): ++ args.syscalls = True ++ ++if args.nanoseconds: ++ factor = 1 ++ label = "nsecs" ++else: ++ factor = 1000 ++ label = "usecs" ++debug = 0 ++ ++# define BPF program ++bpf_text = """ ++#include ++ ++BPF_PERCPU_ARRAY(start, u64, 1); ++ ++STORAGE ++ ++TRACEPOINT_PROBE(raw_syscalls, sys_enter) ++{ ++ COND_FILTER ++ ++ u32 idx = 0; ++ u64 ts = bpf_ktime_get_ns(); ++ start.update(&idx, &ts); ++ ++ return 0; ++} ++ ++TRACEPOINT_PROBE(raw_syscalls, sys_exit) ++{ ++ COND_FILTER ++ ++ u32 idx = 0; ++ u64 *tsp, delta; ++ tsp = start.lookup(&idx); ++ if (!tsp || *tsp == 0) { ++ return 0; // missed start ++ } ++ ++ delta = bpf_ktime_get_ns() - *tsp; ++ u64 zero = 0; ++ start.update(&idx, &zero); ++ ++ u32 sysnr = args->id; ++ u32 pid = bpf_get_current_pid_tgid(); ++ ++ UPDATE ++ ++ return 0; ++} ++""" ++ ++ ++# replace STORAGE ++map_key = """ ++typedef struct map_key { ++ u32 sysnr; ++ u32 pid; ++} map_key_t; ++""" ++ ++map_val = """ ++typedef struct map_val { ++ u64 time; ++ u64 count; ++} map_val_t; ++""" ++ ++if args.dist: ++ if args.syscalls and args.pids: ++ storage = map_key + """ ++typedef struct map_key_slot { ++ map_key_t key; ++ u64 slot; ++} map_key_slot_t; ++""" ++ else: ++ storage = """ ++typedef struct map_key_slot { ++ u64 key; ++ u64 slot; ++} map_key_slot_t; ++""" ++ storage += "BPF_HISTOGRAM(dist, map_key_slot_t);" ++ ++else: ++ storage = map_val ++ if args.syscalls and args.pids: ++ storage += map_key + "BPF_HASH(res, map_key_t, map_val_t);" ++ else: ++ storage += "BPF_HASH(res, u32, map_val_t);" ++bpf_text = bpf_text.replace('STORAGE', storage) ++ ++ ++# replace UPDATE ++if args.dist: ++ if args.syscalls and args.pids: ++ key_exp = "{.sysnr = sysnr, .pid = pid,}" ++ elif args.syscalls: ++ key_exp = "sysnr" ++ elif args.pids: ++ key_exp = "pid" ++ ++ update = 
""" ++ map_key_slot_t key_slot = { ++ .key = %s, ++ .slot = bpf_log2l(delta / FACTOR), ++ }; ++ dist.increment(key_slot); ++""" % key_exp ++ ++else: ++ if args.syscalls and args.pids: ++ update = """ ++ map_key_t key = { ++ .sysnr = sysnr, ++ .pid = pid, ++ }; ++""" ++ elif args.syscalls: ++ update = "u32 key = sysnr;" ++ elif args.pids: ++ update = "u32 key = pid;" ++ ++ update += """ ++ map_val_t val = {}; ++ map_val_t *valp = res.lookup_or_init(&key, &val); ++ if (valp) { ++ valp->count += 1; ++ valp->time += delta; ++ } ++""" ++bpf_text = bpf_text.replace('UPDATE', update) ++bpf_text = bpf_text.replace('FACTOR', str(factor)) ++ ++ ++# replace FILTER ++sysnr_match = "args->id == %s" % args.syscall ++pid_match = "((u32)bpf_get_current_pid_tgid()) == %s" % args.pid ++ ++if args.syscall and args.pid: ++ cond_match = "(%s) && (%s)" % (sysnr_match, pid_match) ++elif args.syscall: ++ cond_match = sysnr_match ++elif args.pid: ++ cond_match = pid_match ++else: ++ cond_match = "" ++ ++cond_filter = "if (!(%s)) {return 0;}" % cond_match if cond_match else "" ++bpf_text = bpf_text.replace('COND_FILTER', cond_filter) ++ ++ ++# output eBPF program C code after it is replaced, used by debugging ++if debug or args.ebpf: ++ print(bpf_text) ++ if args.ebpf: ++ exit() ++ ++ ++# load BPF program ++b = BPF(text=bpf_text) ++print("Tracing syscall statistics... Hit Ctrl-C to end.") ++ ++if args.dist: ++ tab = b.get_table("dist") ++else: ++ tab = b.get_table("res") ++ ++ ++def pid_to_comm(pid): ++ try: ++ return open("/proc/%d/comm" % pid, "r").read().rstrip("\n") ++ except IOError: ++ return "" ++ ++ ++def get_pid_info(pid): ++ return "%10d %30s" % (pid, pid_to_comm(pid)) ++ ++def get_syscall_info(sysnr): ++ return "%10d %30s" % (sysnr, syscall_name(sysnr)) ++ ++def skip_errata(k): ++ if args.syscalls and args.pids: ++ sysnr = k.sysnr ++ else: # args.syscalls or args.pids ++ sysnr = k.value ++ ++ # happens occasionally, we don't need it ++ if sysnr == 0xFFFFFFFF: ++ return True ++ else: ++ return False ++ ++ ++def print_stat(tab): ++ if args.syscalls and args.pids: ++ section_hdr = "%10s %30s %10s %30s" % ("SysNR", "Syscall", "PID", "Thread") ++ elif args.pids: ++ section_hdr = "%10s %30s" % ("PID", "Thread") ++ else: # args.syscalls ++ section_hdr = "%10s %30s" % ("SysNR", "Syscall") ++ ++ section_hdr += " %10s %10s" % ("Count", "TOTAL_" + label) ++ print(section_hdr) ++ ++ # k is struct map_key; v is struct map_val ++ for k, v in sorted(tab.items(), key=lambda res: (-res[1].count, -res[1].time)): ++ if skip_errata(k): ++ continue ++ ++ if args.syscalls and args.pids: ++ record = "%40s %40s" % (get_syscall_info(k.sysnr), get_pid_info(k.pid)) ++ elif args.pids: ++ record = "%40s" % (get_pid_info(k.value)) ++ else: # args.syscalls ++ record = "%40s" % (get_syscall_info(k.value)) ++ ++ record += "%10d %10d" % (v.count, v.time / factor) ++ print(record) ++ ++ ++def get_dist_pid_info(pid): ++ return "PID %d (%s)" % (pid, pid_to_comm(pid)) ++ ++def get_dist_syscall_info(sysnr): ++ return "Syscall %d (%s)" % (sysnr, syscall_name(sysnr)) ++ ++def print_dist_section(k): ++ if args.syscalls and args.pids: ++ return "%s %s" % (get_dist_syscall_info(k[0]), get_dist_pid_info(k[1])) ++ elif args.pids: ++ return "%40s" % (get_dist_pid_info(k)) ++ else: ++ return "%40s" % (get_dist_syscall_info(k)) ++ ++def print_dist_stat(tab): ++ section = "section" if args.syscalls and args.pids else "Thread" if args.pids else "Syscall" ++ tab.print_log2_hist("time_" + label, section, ++ section_print_fn=print_dist_section, ++ 
bucket_fn=(lambda k: (k.sysnr, k.pid)) if args.syscalls and args.pids else None,
++        bucket_sort_fn=lambda k: sorted(k))
++
++
++# output
++while (1):
++    try:
++        sleep(int(args.interval))
++    except KeyboardInterrupt:
++        countdown = 1
++    print()
++    if args.timestamp:
++        print("%-8s\n" % strftime("%H:%M:%S"), end="")
++
++    if args.dist:
++        print_dist_stat(tab)
++    else:
++        print_stat(tab)
++
++    tab.clear()
++
++    countdown -= 1
++    if countdown == 0:
++        exit()
+diff --git a/tools/alisyscount_example.txt b/tools/alisyscount_example.txt
+new file mode 100644
+index 0000000..935f2ff
+--- /dev/null
++++ b/tools/alisyscount_example.txt
+@@ -0,0 +1,199 @@
++This program summarizes syscall counts and latencies, per syscall,
++per PID, or as a histogram.
++
++There are two major categories of parameters:
++1. one category of parameters controls the sampling source, such as
++-s[--syscall] and -p[--pid]
++2. another category of parameters controls the display format,
++such as -S[--syscalls], -P[--pids] and -D[--dist]
++
++These two categories of parameters are orthogonal.
++
++
++Example 1.
++
++By default, the program will sample all syscalls of all processes.
++Statistics of all processes on the same syscall are aggregated and
++displayed in a per-syscall flavor.
++
++In this case, "alisyscount" is the same as "alisyscount -S".
++
++# alisyscount
++Tracing syscall statistics... Hit Ctrl-C to end.
++^C
++     SysNR         Syscall      Count TOTAL_usecs
++        35       nanosleep        246    23815620
++         4            stat        207         306
++         0            read         61          65
++       321             bpf         43         112
++         3           close         28          23
++        14  rt_sigprocmask         20           5
++       232      epoll_wait         20           5
++       162            sync         19     1901216
++       228   clock_gettime         18           5
++        23          select          9     4016487
++         6           lstat          9          16
++         1           write          8         136
++         2            open          8          19
++        72           fcntl          6           1
++         5           fstat          4           2
++        16           ioctl          3           3
++       271           ppoll          1      900954
++        78        getdents          1         355
++        89        readlink          1           4
++       257          openat          1           2
++         9            mmap          1           2
++       100           times          1           2
++        13    rt_sigaction          1           0
++
++
++Example 2.
++
++Display the output in a per-process flavor with the "-P" parameter.
++In this case, all syscalls of all processes are sampled as usual, but
++all statistics of the same process are aggregated, whichever syscall
++it invokes.
++
++# alisyscount -P
++Tracing syscall statistics... Hit Ctrl-C to end.
++^C
++       PID           Thread      Count TOTAL_usecs
++      3550        AliYunDun        125     3103841
++      3529        AliYunDun         71     3101981
++      3549        AliYunDun         62     3103861
++      3543        AliYunDun         61     3001874
++      3493   aliyun-service         61     3001874
++      3564             sshd         48     3149751
++      3824   alisyscount.py         46     3149398
++      3548        AliYunDun         40     3002530
++      3544        AliYunDun         32     2000233
++      3530        AliYunDun         31     3101971
++      3556        AliYunDun         31     3001000
++      3542        AliYunDun         30     3001918
++      3531        AliYunDun         30     3001871
++      3423  AliYunDunUpdate         30     3001867
++      3424  AliYunDunUpdate         30     3001853
++      3475     in:imjournal          5     1801873
++      3551        AliYunDun          3     1000111
++      3422  AliYunDunUpdate          3           1
++      3553        AliYunDun          2     2000172
++      3547        AliYunDun          2     2000172
++      3555        AliYunDun          2     2000162
++
++
++Example 3.
++
++You can set the "-P" and "-S" parameters at the same time, in which case
++the output is displayed in a per-syscall and per-process flavor.
++
++# alisyscount -P -S
++Tracing syscall statistics... Hit Ctrl-C to end.
++^C
++     SysNR         Syscall        PID           Thread      Count TOTAL_usecs
++         4            stat       3550        AliYunDun        586         855
++        35       nanosleep       3549        AliYunDun         40     2002500
++         0            read       3550        AliYunDun         36          90
++        35       nanosleep       3548        AliYunDun         26     1951607
++       321             bpf       3832   alisyscount.py         23          36
++        35       nanosleep       3550        AliYunDun         22     1901390
++        35       nanosleep       3530
++
++
++Example 4.
++
++Sample a specified process only with "-p"; "-P" is added implicitly
++when "-p" is specified.
++
++# alisyscount -p 3550
++Tracing syscall statistics... Hit Ctrl-C to end.
++^C
++       PID           Thread      Count TOTAL_usecs
++      3550        AliYunDun        121     3003773
++
++
++Example 5.
++
++When both the "-p" and "-S" parameters are specified, the output is displayed
++in a per-syscall and per-process flavor.
++
++# alisyscount -p 3550 -S
++Tracing syscall statistics... Hit Ctrl-C to end.
++^C
++     SysNR         Syscall        PID           Thread      Count TOTAL_usecs
++         0            read       3550        AliYunDun         42          16
++        35       nanosleep       3550        AliYunDun         41     2052562
++
++
++Example 6.
++
++Sample a specified syscall only with "-s"; "-S" is added
++implicitly when "-s" is specified.
++
++# alisyscount -s 3
++Tracing syscall statistics... Hit Ctrl-C to end.
++^C
++     SysNR         Syscall      Count TOTAL_usecs
++         3           close         36          15
++
++
++Example 7.
++
++Display the output as a histogram when "-D" is specified.
++
++# alisyscount -p 3550 -D
++Tracing syscall statistics... Hit Ctrl-C to end.
++^C
++
++Thread = PID 3550 (AliYunDun)
++     time_usecs          : count     distribution
++         0 -> 1          : 33       |****************************************|
++         2 -> 3          : 0        |                                        |
++         4 -> 7          : 0        |                                        |
++         8 -> 15         : 0        |                                        |
++        16 -> 31         : 0        |                                        |
++        32 -> 63         : 0        |                                        |
++        64 -> 127        : 0        |                                        |
++       128 -> 255        : 0        |                                        |
++       256 -> 511        : 0        |                                        |
++       512 -> 1023       : 0        |                                        |
++      1024 -> 2047       : 0        |                                        |
++      2048 -> 4095       : 0        |                                        |
++      4096 -> 8191       : 0        |                                        |
++      8192 -> 16383      : 0        |                                        |
++     16384 -> 32767      : 0        |                                        |
++     32768 -> 65535      : 32       |**************************************  |
++
++
++
++USAGE message:
++
++# alisyscount -h
++usage: alisyscount.py [-h] [-T] [-N] [-S] [-P] [-D] [-s SYSCALL] [-p PID]
++                      [interval] [outputs]
++
++Summarize syscall counts and latencies.
++
++positional arguments:
++  interval           output interval, in seconds
++  outputs            number of outputs
++
++optional arguments:
++  -h, --help         show this help message and exit
++  -T, --timestamp    include timestamp on output
++  -N, --nanoseconds  output in nanoseconds
++  -S, --syscalls     show per-syscall statistics
++  -P, --pids         show per-thread syscall statistics
++  -D, --dist         show distributions as histograms
++  -s SYSCALL, --syscall SYSCALL
++                     show statistics of specific syscall only
++  -p PID, --pid PID  show syscall statistics of specific PID only
++
++examples:
++    ./alisyscount          # sum syscall counts and latencies.
++    ./alisyscount -S       # show per-syscall statistics
++    ./alisyscount -P       # show per-thread syscall statistics
++    ./alisyscount -D       # show syscall statistics as histograms
++    ./alisyscount -s 0     # show syscall statistics of syscall 0 only
++    ./alisyscount -p 25    # show syscall statistics of PID 25 only
++    ./alisyscount 1 10     # print 1 second summaries, 10 times
++    ./alisyscount -NT 1    # 1s summaries, nanoseconds, and timestamps
+diff --git a/tools/alisysdelay.py b/tools/alisysdelay.py
+new file mode 100755
+index 0000000..e4c7952
+--- /dev/null
++++ b/tools/alisysdelay.py
+@@ -0,0 +1,297 @@
++#!/usr/bin/python
++# @lint-avoid-python-3-compatibility-imports
++#
++# alisysdelay    Catch tasks hung in kernel mode and dump their stacks
++#                For Linux, uses BCC, BPF, perf_events. Embedded C.
++#
++# In kernels without CONFIG_PREEMPT enabled, scheduling latency sometimes
++# may be unacceptable, because some tasks may run too long in kernel mode
++# without calling cond_resched(). These tasks may be kernel threads, or user
++# processes that enter kernel mode via syscall.
++#
++# alisysdelay can catch tasks that have been running continuously for a long time
++# without rescheduling, and the kernel/user stacks of these tasks will be printed.
++# The threshold can be user defined.
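++#
++# Detection scheme (as implemented below): a perf software CPU-clock event
++# samples every CPU at the chosen frequency; each sample compares "now"
++# against the per-CPU timestamp recorded at the last sys_enter, sched_switch
++# or __cond_resched event, and dumps the kernel/user stacks once the delta
++# exceeds the threshold.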
++#
++# By default CPU idle stacks are excluded by simply excluding PID 0.
++#
++# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
++#
++# USAGE: alisysdelay [-h] [-d] [-a] [-f frequency] [-t threshold] [--stack-storage-size size]
++#
++# Copyright (c) 2016 Brendan Gregg, Netflix, Inc.
++# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
++# Licensed under the Apache License, Version 2.0 (the "License")
++
++from __future__ import print_function
++from bcc import BPF, PerfType, PerfSWConfig
++import argparse
++import errno
++import ctypes as ct
++
++
++# arg validation
++def positive_int(val):
++    try:
++        ival = int(val)
++    except ValueError:
++        raise argparse.ArgumentTypeError("must be an integer")
++
++    if ival < 0:
++        raise argparse.ArgumentTypeError("must be positive")
++    return ival
++
++def positive_nonzero_int(val):
++    ival = positive_int(val)
++    if ival == 0:
++        raise argparse.ArgumentTypeError("must be nonzero")
++    return ival
++
++def stack_id_err(stack_id):
++    # -EFAULT in get_stackid normally means the stack trace is not available,
++    # such as getting a kernel stack trace in userspace code
++    return (stack_id < 0) and (stack_id != -errno.EFAULT)
++
++# arguments
++examples = """examples:
++    ./alisysdelay          # catch tasks hung in kernel mode
++    ./alisysdelay -f 99    # catch tasks hung in kernel mode at 99 Hertz
++    ./alisysdelay -t 100   # catch tasks that hung in kernel mode for more than 100us
++"""
++parser = argparse.ArgumentParser(
++    description="Catch tasks hung in kernel mode and dump their stacks",
++    formatter_class=argparse.RawDescriptionHelpFormatter,
++    epilog=examples)
++parser.add_argument("-f", "--frequency", type=positive_nonzero_int, default=100,
++    help="sample frequency in Hertz "
++    "(default %(default)s)")
++parser.add_argument("-d", "--delimited", action="store_true",
++    help="insert delimiter between kernel/user stacks")
++parser.add_argument("-a", "--annotations", action="store_true",
++    help="add _[k] annotations to kernel frames")
++parser.add_argument("-t", "--threshold", type=positive_nonzero_int, default=50,
++    help="set the threshold of delay in kernel mode, in us "
++    "(default %(default)s)")
++parser.add_argument("--stack-storage-size", type=positive_nonzero_int, default=16384,
++    help="the number of unique stack traces that can be stored and displayed "
++    "(default %(default)s)")
++parser.add_argument("--ebpf", action="store_true",
++    help=argparse.SUPPRESS)
++
++# option logic
++args = parser.parse_args()
++debug = 0
++
++
++# define BPF program
++bpf_text = """
++#include
++#include
++#include
++#include
++#include
++
++typedef struct event {
++    u32 pid;
++    u64 kernel_ip;
++    int user_stack_id;
++    int kernel_stack_id;
++    char comm[TASK_COMM_LEN];
++} event_t;
++
++BPF_PERCPU_ARRAY(start, u64, 1);
++BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
++BPF_PERF_OUTPUT(events);
++
++static inline void update_start_time()
++{
++    u32 idx = 0;
++    u64 ts = bpf_ktime_get_ns();
++    start.update(&idx, &ts);
++}
++
++TRACEPOINT_PROBE(raw_syscalls, sys_enter)
++{
++    update_start_time();
++    return 0;
++}
++
++TRACEPOINT_PROBE(sched, sched_switch)
++{
++    update_start_time();
++    return 0;
++}
++
++int kprobe___cond_resched(struct pt_regs *ctx)
++{
++    update_start_time();
++    return 0;
++}
++
++// Copied from user_mode() in arch/x86/include/asm/ptrace.h
++static inline int user_mode_dump(struct pt_regs *regs)
++{
++#ifdef CONFIG_X86_32
++    u64 cs, flags;
++    bpf_probe_read(&cs, sizeof(u64), &regs->cs);
++    bpf_probe_read(&flags, sizeof(u64), &regs->flags);
++    return ((cs & SEGMENT_RPL_MASK) |
(flags & X86_VM_MASK)) >= USER_RPL; ++#else ++ u64 cs; ++ bpf_probe_read(&cs, sizeof(u64), ®s->cs); ++ return !!( cs & 3); ++#endif ++} ++ ++static bool need_check_delay(struct pt_regs *regs) ++{ ++ // ignore process 0 ++ if (bpf_get_current_pid_tgid() == 0) { ++ return 0; ++ } ++ ++ // skip when currently cpu is in user mode ++ if (user_mode_dump(regs)) { ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static inline void get_stack_id(struct bpf_perf_event_data *ctx) ++{ ++ event_t key = {.pid = bpf_get_current_pid_tgid()}; ++ bpf_get_current_comm(&key.comm, sizeof(key.comm)); ++ ++ // get stacks ++ key.user_stack_id = stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK); ++ key.kernel_stack_id = stack_traces.get_stackid(&ctx->regs, 0); ++ ++ if (key.kernel_stack_id >= 0) { ++ // populate extras to fix the kernel stack ++ u64 ip = PT_REGS_IP(&ctx->regs); ++ u64 page_offset; ++ ++ // if ip isn't sane, leave key ips as zero for later checking ++#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE) ++ // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it ++ page_offset = __PAGE_OFFSET_BASE; ++#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4) ++ // x64, 4.17, and later ++#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL) ++ page_offset = __PAGE_OFFSET_BASE_L5; ++#else ++ page_offset = __PAGE_OFFSET_BASE_L4; ++#endif ++#else ++ // earlier x86_64 kernels, e.g., 4.6, comes here ++ // arm64, s390, powerpc, x86_32 ++ page_offset = PAGE_OFFSET; ++#endif ++ ++ if (ip > page_offset) { ++ key.kernel_ip = ip; ++ } ++ } ++ ++ events.perf_submit(ctx, &key, sizeof(event_t)); ++} ++ ++int sys_timer(struct bpf_perf_event_data *ctx) { ++ ++ if (need_check_delay(&ctx->regs)) { ++ u32 idx = 0; ++ u64 *tsp = start.lookup(&idx); ++ if (!tsp || *tsp == 0) { ++ return 0; // missed start ++ } ++ ++ if ((bpf_ktime_get_ns() - *tsp) >= (DELAY_THRESHOLD)) { ++ get_stack_id(ctx); ++ } ++ } ++ else { ++ update_start_time(); ++ } ++ ++ return 0; ++} ++""" ++ ++# set stack storage size ++bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) ++bpf_text = bpf_text.replace('DELAY_THRESHOLD', str(args.threshold * 1000)) ++ ++ ++if debug or args.ebpf: ++ print(bpf_text) ++ if args.ebpf: ++ exit() ++ ++# initialize BPF & perf_events ++b = BPF(text=bpf_text) ++b.attach_perf_event(ev_type=PerfType.SOFTWARE, ++ ev_config=PerfSWConfig.CPU_CLOCK, fn_name="sys_timer", ++ sample_freq=args.frequency, pid=-1, cpu=-1, group_fd=-1) ++ ++ ++def aksym(addr): ++ if args.annotations: ++ return b.ksym(addr) + "_[k]".encode() ++ else: ++ return b.ksym(addr) ++ ++TASK_COMM_LEN = 16 ++class Record(ct.Structure): ++ _fields_ = [("pid", ct.c_ulong), ++ ("kernel_ip", ct.c_ulonglong), ++ ("user_stack_id", ct.c_int), ++ ("kernel_stack_id", ct.c_int), ++ ("comm", ct.c_char * TASK_COMM_LEN)] ++ ++ ++def print_stack(cpu, data, size): ++ k = ct.cast(data, ct.POINTER(Record)).contents ++ ++ user_stack = [] if k.user_stack_id < 0 else stack_traces.walk(k.user_stack_id) ++ kernel_tmp = [] if k.kernel_stack_id < 0 else stack_traces.walk(k.kernel_stack_id) ++ ++ # fix kernel stack ++ kernel_stack = [] ++ if k.kernel_stack_id >= 0: ++ for addr in kernel_tmp: ++ kernel_stack.append(addr) ++ # the later IP checking ++ if k.kernel_ip: ++ kernel_stack.insert(0, k.kernel_ip) ++ ++ # print default multi-line stack output ++ if stack_id_err(k.kernel_stack_id): ++ print(" [Missed Kernel Stack]") ++ else: ++ for addr in kernel_stack: ++ print(" %s" % aksym(addr)) ++ if args.delimited and 
k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
++        print("    --")
++    if stack_id_err(k.user_stack_id):
++        print("    [Missed User Stack]")
++    else:
++        for addr in user_stack:
++            print("    %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))
++    print("    %-16s %s (%d)" % ("-", k.comm.decode('utf-8', 'replace'), k.pid))
++    print()
++
++
++# header
++print("Sampling at %s Hz frequency" % args.frequency, end="")
++print("... Hit Ctrl-C to end.")
++
++# collect samples
++b["events"].open_perf_buffer(print_stack)
++stack_traces = b["stack_traces"]
++
++while (1):
++    try:
++        b.perf_buffer_poll()
++    except KeyboardInterrupt:
++        exit()
+diff --git a/tools/alisysdelay_example.txt b/tools/alisysdelay_example.txt
+new file mode 100644
+index 0000000..6f8f593
+--- /dev/null
++++ b/tools/alisysdelay_example.txt
+@@ -0,0 +1,75 @@
++alisysdelay is used to catch tasks hung in kernel mode and dump their stacks.
++
++In kernels without CONFIG_PREEMPT enabled, scheduling latency sometimes
++may be unacceptable, because some tasks may run too long in kernel mode
++without calling cond_resched(). These tasks may be kernel threads, or user
++processes that enter kernel mode via syscall.
++
++alisysdelay can catch tasks that have been running continuously for a long time
++without rescheduling, and kernel/user stacks will be printed for these tasks.
++The threshold can be user defined.
++
++By default CPU idle stacks are excluded by simply excluding PID 0.
++
++
++
++For example,
++
++# alisysdelay -t 10
++Sampling at 100 Hz frequency... Hit Ctrl-C to end.
++
++    iowrite16
++    iowrite16
++    vp_notify
++    virtqueue_notify
++    start_xmit
++    dev_hard_start_xmit
++    sch_direct_xmit
++    __qdisc_run
++    __dev_queue_xmit
++    ip_finish_output2
++    ip_output
++    __tcp_transmit_skb
++    tcp_write_xmit
++    __tcp_push_pending_frames
++    tcp_sendmsg_locked
++    tcp_sendmsg
++    sock_sendmsg
++    sock_write_iter
++    __vfs_write
++    vfs_write
++    ksys_write
++    do_syscall_64
++    entry_SYSCALL_64_after_hwframe
++    __write_nocancel
++    -                sshd (4617)
++
++In this case, sshd (PID 4617) is caught staying in kernel mode
++for more than 10us without rescheduling.
++
++
++
++USAGE message:
++# alisysdelay -h
++usage: alisysdelay.py [-h] [-f FREQUENCY] [-d] [-a] [-t THRESHOLD]
++                      [--stack-storage-size STACK_STORAGE_SIZE]
++
++Catch tasks hung in kernel mode and dump their stacks
++
++optional arguments:
++  -h, --help            show this help message and exit
++  -f FREQUENCY, --frequency FREQUENCY
++                        sample frequency in Hertz (default 100)
++  -d, --delimited       insert delimiter between kernel/user stacks
++  -a, --annotations     add _[k] annotations to kernel frames
++  -t THRESHOLD, --threshold THRESHOLD
++                        set the threshold of delay in kernel mode, in us
++                        (default 50)
++  --stack-storage-size STACK_STORAGE_SIZE
++                        the number of unique stack traces that can be stored
++                        and displayed (default 16384)
++
++examples:
++    ./alisysdelay          # catch tasks hung in kernel mode
++    ./alisysdelay -f 99    # catch tasks hung in kernel mode at 99 Hertz
++    ./alisysdelay -t 100   # catch tasks that hung in kernel mode for more than 100us
+diff --git a/tools/alisyslatency.py b/tools/alisyslatency.py
+new file mode 100755
+index 0000000..05bfdbd
+--- /dev/null
++++ b/tools/alisyslatency.py
+@@ -0,0 +1,171 @@
++#!/usr/bin/python
++# @lint-avoid-python-3-compatibility-imports
++#
++# alisyslatency  Summarize time spent in kernel context.
++#                For Linux, uses BCC, eBPF.
++#
++# USAGE: alisyslatency [-h] [-T] [-N] [-c CPU] [-p PID] [interval] [outputs]
++#
++# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
++# Licensed under the Apache License, Version 2.0 (the "License") ++ ++from __future__ import print_function ++from bcc import BPF ++from time import sleep, strftime ++import argparse ++ ++# arguments ++examples = """examples: ++ ./alisyslatency # sum time in kernel context per CPU ++ ./alisyslatency -c 0 # show time in kernel context in CPU 0 only ++ ./alisyslatency -p 25 # show time in kernel context in pid 25 only ++ ./alisyslatency 1 10 # print 1 second summaries, 10 times ++ ./alisyslatency -NT 1 # 1s summaries, nanoseconds, and timestamps ++""" ++ ++parser = argparse.ArgumentParser( ++ description="Summarize time spent in kernel context.", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=examples) ++parser.add_argument("-T", "--timestamp", action="store_true", ++ help="include timestamp on output") ++parser.add_argument("-N", "--nanoseconds", action="store_true", ++ help="output in nanoseconds") ++ ++thread_group = parser.add_mutually_exclusive_group() ++thread_group.add_argument("-c", "--cpu", ++ help="output statistics on specific CPU only") ++thread_group.add_argument("-p", "--pid", ++ help="output statistics of specific pid only") ++ ++parser.add_argument("interval", nargs="?", default=99999999, ++ help="output interval, in seconds") ++parser.add_argument("outputs", nargs="?", default=99999999, ++ help="number of outputs") ++parser.add_argument("--ebpf", action="store_true", ++ help=argparse.SUPPRESS) ++ ++args = parser.parse_args() ++countdown = int(args.outputs) ++if args.nanoseconds: ++ factor = 1 ++ label = "nsecs" ++else: ++ factor = 1000 ++ label = "usecs" ++debug = 0 ++ ++# define BPF program ++bpf_text = """ ++#include ++ ++typedef struct map_val { ++ u64 count; ++ u64 time; ++ u64 time_max; ++} map_val_t; ++ ++BPF_PERCPU_ARRAY(start, u64, 1); ++BPF_HASH(res, u32, map_val_t); ++ ++ ++TRACEPOINT_PROBE(context_tracking, user_exit) ++{ ++ COND_FILTER ++ ++ u32 idx = 0; ++ u64 ts = bpf_ktime_get_ns(); ++ start.update(&idx, &ts); ++ ++ return 0; ++} ++ ++TRACEPOINT_PROBE(context_tracking, user_enter) ++{ ++ COND_FILTER ++ ++ u64 *tsp, delta; ++ u32 idx = 0; ++ // fetch timestamp and calculate delta ++ tsp = start.lookup(&idx); ++ if (!tsp || *tsp == 0) { ++ return 0; // missed start ++ } ++ ++#if STATISTICS_PER_PROCESS ++ u32 key = bpf_get_current_pid_tgid(); ++#else ++ u32 key = bpf_get_smp_processor_id(); ++#endif ++ ++ delta = bpf_ktime_get_ns() - *tsp; ++ map_val_t *valp, val; ++ ++ valp = res.lookup(&key); ++ if (valp) { ++ valp->count += 1; ++ valp->time += delta; ++ if (valp->time_max < delta) {valp->time_max = delta;} ++ } ++ else { ++ val.count = 1; ++ val.time = val.time_max = delta; ++ res.update(&key, &val); ++ } ++ ++ u64 zero = 0; ++ start.update(&idx, &zero); ++ ++ return 0; ++} ++""" ++ ++cond_exp = "1" ++if args.cpu: ++ cond_exp = "bpf_get_smp_processor_id() == %s" % args.cpu ++ ++if args.pid: ++ cond_exp = "(u32)bpf_get_current_pid_tgid() == %s" % args.pid ++ ++cond_filter = "if (!(%s)) {return 0;}" % cond_exp ++bpf_text = bpf_text.replace('COND_FILTER', cond_filter) ++ ++bpf_text = bpf_text.replace('STATISTICS_PER_PROCESS', "1" if args.cpu else "0") ++ ++ ++# output eBPF program C code after it is replaced, used by debugging ++if debug or args.ebpf: ++ print(bpf_text) ++ if args.ebpf: ++ exit() ++ ++ ++# load BPF program ++b = BPF(text=bpf_text) ++print("Tracing time in kernel context... 
Hit Ctrl-C to end.") ++tab = b.get_table("res") ++ ++ ++# output ++while (1): ++ try: ++ sleep(int(args.interval)) ++ except KeyboardInterrupt: ++ countdown = 1 ++ print() ++ if args.timestamp: ++ print("%-8s\n" % strftime("%H:%M:%S"), end="") ++ ++ # print header ++ print("%10s %10s %10s %10s" % ++ ("PID" if args.cpu else "CPU", "Count", "TOTAL_" + label, "MAX_" + label)) ++ ++ for k, v in sorted(tab.items(), key=lambda res: res[0]): ++ print("%10d %10d %10d %10d" % ++ (k, v.count, v.time / factor, v.time_max / factor)) ++ ++ tab.clear() ++ ++ countdown -= 1 ++ if countdown == 0: ++ exit() +diff --git a/tools/alisyslatency_example.txt b/tools/alisyslatency_example.txt +new file mode 100644 +index 0000000..f5dd2c7 +--- /dev/null ++++ b/tools/alisyslatency_example.txt +@@ -0,0 +1,72 @@ ++Demonstrations of alisyslatency, the Linux eBPF/bcc version. ++ ++ ++This program traces time spent in kernel context (between ++context_tracking:user_exit and context_tracking:user_enter), ++and stores timing statistics in-kernel for efficiency. ++ ++ ++**Note** This tool needs Context tracking subsystem, which can not ++work without support of full dynticks. ++ ++ ++Example 1 ++ ++By default, all statistics of all processes on all CPUs are sampled, ++and the output is displayed in a per-CPU flavor. ++ ++# alisyslatency ++Tracing time in kernel context... Hit Ctrl-C to end. ++^C ++ CPU Count TOTAL_usecs MAX_usecs ++ ++ ++Example 2 ++ ++You can sample statistics on specified CPU only with "-c" option, ++and the output will be displayed in a per-process flavor. ++ ++# alisyslatency -c 6 ++Tracing time in kernel context... Hit Ctrl-C to end. ++^C ++ PID Count TOTAL_usecs MAX_usecs ++ ++ ++ ++Exampel 3 ++ ++You can sample statistics of specified process only with "-p" option, ++and the output will be displayed in a per-CPU flavor. ++ ++ ++# alisyslatency -p 3550 ++Tracing time in kernel context... Hit Ctrl-C to end. ++^C ++ CPU Count TOTAL_usecs MAX_usecs ++ ++ ++ ++USAGE message: ++ ++# alisyslatency -h ++usage: alisyslatency.py [-h] [-T] [-N] [-c CPU | -p PID] [interval] [outputs] ++ ++Summarize time spent in kernel context. ++ ++positional arguments: ++ interval output interval, in seconds ++ outputs number of outputs ++ ++optional arguments: ++ -h, --help show this help message and exit ++ -T, --timestamp include timestamp on output ++ -N, --nanoseconds output in nanoseconds ++ -c CPU, --cpu CPU output statistics on specific CPU only ++ -p PID, --pid PID output statistics of specific pid only ++ ++examples: ++ ./alisyslatency # sum time in kernel context per CPU ++ ./alisyslatency -c 0 # show time in kernel context in CPU 0 only ++ ./alisyslatency -p 25 # show time in kernel context in pid 25 only ++ ./alisyslatency 1 10 # print 1 second summaries, 10 times ++ ./alisyslatency -NT 1 # 1s summaries, nanoseconds, and timestamps +diff --git a/tools/aliworkslower.py b/tools/aliworkslower.py +new file mode 100755 +index 0000000..6c7fffe +--- /dev/null ++++ b/tools/aliworkslower.py +@@ -0,0 +1,137 @@ ++#!/usr/bin/python ++# @lint-avoid-python-3-compatibility-imports ++# ++# aliworkslower Trace high work sched latency. ++# For Linux, uses BCC, eBPF. ++# ++# USAGE: aliworkslower [-h] [threshold] ++# ++# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. 
++# Licensed under the Apache License, Version 2.0 (the "License") ++ ++from __future__ import print_function ++from bcc import BPF ++import argparse ++import ctypes as ct ++ ++# arguments ++examples = """examples: ++ ./aliworkslower # trace work sched latency higher than 10000 us (default) ++ ./aliworkslower 1000 # trace work sched latency higher than 1000 us ++""" ++parser = argparse.ArgumentParser( ++ description="Trace high work sched latency", ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ epilog=examples) ++parser.add_argument("threshold", nargs="?", type=int, default=10000, ++ help="threshold of Q2S latency, in microsecond") ++parser.add_argument("--ebpf", action="store_true", ++ help=argparse.SUPPRESS) ++ ++args = parser.parse_args() ++debug = 0 ++ ++# define BPF program ++bpf_text = """ ++#include ++ ++typedef struct work_timestamp { ++ u64 time_queue_work; ++ u64 time_activate_work; ++} work_timestamp_t; ++ ++typedef struct work_latency { ++ u64 pid; // pid of worker thread ++ u64 func; // function pointer of the work callback ++ u64 q2a; // queue_work to activate_work ++ u64 a2s; // activate_work to execute_start ++} work_latency_t; ++ ++BPF_HASH(res, u64, work_timestamp_t); ++BPF_PERF_OUTPUT(events); ++ ++TRACEPOINT_PROBE(workqueue, workqueue_queue_work) ++{ ++ u64 work = (u64)args->work; ++ work_timestamp_t stamp = { ++ .time_queue_work = bpf_ktime_get_ns(), ++ }; ++ res.update(&work, &stamp); ++ ++ return 0; ++} ++ ++TRACEPOINT_PROBE(workqueue, workqueue_activate_work) ++{ ++ u64 work = (u64)args->work; ++ work_timestamp_t *stampp = res.lookup(&work); ++ ++ if (stampp) { ++ stampp->time_activate_work = bpf_ktime_get_ns(); ++ } ++ ++ return 0; ++} ++ ++TRACEPOINT_PROBE(workqueue, workqueue_execute_start) ++{ ++ u64 work = (u64)args->work; ++ u64 now = bpf_ktime_get_ns(); ++ work_timestamp_t *stampp = res.lookup(&work); ++ ++ if (stampp) { ++ u64 delta = now - stampp->time_queue_work; ++ if (delta > THRESHOLD) { ++ work_latency_t latency = { ++ .pid = (u32)bpf_get_current_pid_tgid(), ++ .func = (u64)args->function, ++ .q2a = stampp->time_activate_work - stampp->time_queue_work, ++ .a2s = now - stampp->time_activate_work, ++ }; ++ events.perf_submit(args, &latency, sizeof(work_latency_t)); ++ } ++ } ++ ++ return 0; ++} ++""" ++ ++bpf_text = bpf_text.replace('THRESHOLD', "%d" % (args.threshold * 1000)) ++ ++# output eBPF program C code after it is replaced, used by debugging ++if debug or args.ebpf: ++ print(bpf_text) ++ if args.ebpf: ++ exit() ++ ++ ++class Record(ct.Structure): ++ _fields_ = [("pid", ct.c_ulong), ++ ("func", ct.c_ulonglong), ++ ("q2a", ct.c_ulonglong), ++ ("a2s", ct.c_ulonglong)] ++ ++ ++def print_record(cpu, data, size): ++ event = ct.cast(data, ct.POINTER(Record)).contents ++ print("%10s %20s %25d %25d %25d" % (event.pid, ++ b.ksym(event.func), ++ (event.q2a + event.a2s) / 1000, ++ event.q2a / 1000, ++ event.a2s / 1000)) ++ ++# load BPF program ++b = BPF(text=bpf_text) ++print("Tracing high work sched latency... 
Hit Ctrl-C to end.") ++print("%10s %20s %25s %25s %25s" % ++ ("Worker Thread", "Work", "Total Q2S latency (us)", "Q2A latency (us)", "A2S latency (us)")) ++ ++b["events"].open_perf_buffer(print_record) ++ ++ ++# output ++while (1): ++ try: ++ b.perf_buffer_poll() ++ except KeyboardInterrupt: ++ exit() +diff --git a/tools/aliworkslower_example.txt b/tools/aliworkslower_example.txt +new file mode 100644 +index 0000000..21775d0 +--- /dev/null ++++ b/tools/aliworkslower_example.txt +@@ -0,0 +1,39 @@ ++aliworkslower shows high latency of work between workqueue_queue_work ++and workqueue_execute_start. ++ ++Users can specify a threshhold, and whenever the latency between ++workqueue_queue_work and workqueue_execute_start exceeds this ++threshold, related information of this work is outputed. ++ ++For example, ++ ++# ./aliworkslower 15 ++Tracing high work sched latency... Hit Ctrl-C to end. ++Worker Thread Work Total Q2S latency (us) Q2A latency (us) A2S latency (us) ++ 19783 vmstat_update 36 0 35 ++ 29195 vmstat_update 16 0 15 ++ ++- column "Worker Thread": PID of the worker thread ++- column "Work": work callback ++- column "Total Q2S latency": latency between workqueue_queue_work (Q) ++and workqueue_execute_start (S), in us ++- column "Q2A latency": latency between workqueue_queue_work (Q) ++and workqueue_activate_work (A), in us ++- column "A2S latency": latency between workqueue_activate_work (A) ++and workqueue_execute_start (S), in us ++ ++ ++# aliworkslower --help ++usage: aliworkslower.py [-h] [threshold] ++ ++Trace high work sched latency ++ ++positional arguments: ++ threshold threshold of Q2S latency, in microsecond ++ ++optional arguments: ++ -h, --help show this help message and exit ++ ++examples: ++ ./aliworkslower # trace work sched latency higher than 10000 us (default) ++ ./aliworkslower 1000 # trace work sched latency higher than 1000 us +-- +2.27.0 + diff --git a/bcc.spec b/bcc.spec index e891758..9e5019d 100644 --- a/bcc.spec +++ b/bcc.spec @@ -1,3 +1,4 @@ +%define anolis_release .0.1 # luajit is not available RHEL 8 %bcond_with lua @@ -9,7 +10,7 @@ Name: bcc Version: 0.16.0 -Release: 3%{?dist} +Release: 3%{anolis_release}%{?dist} Summary: BPF Compiler Collection (BCC) License: ASL 2.0 URL: https://github.com/iovisor/bcc @@ -22,6 +23,8 @@ Patch2: %{name}-%{version}-sync-with-libbpf-repo.patch Patch3: %{name}-%{version}-slabratetop-Add-memcg_cache_params-struct-def.patch Patch4: %{name}-%{version}-tools-include-kasan-header-in-slabtoprate.patch +Patch1000: 1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch + # Arches will be included as upstream support is added and dependencies are # satisfied in the respective arches ExcludeArch: i686 @@ -218,6 +221,9 @@ done %changelog +* Fri Nov 19 2021 Chunmei Xu - 0.16.0-3.0.1 +- add ali enhanced tools for fs schedule and sys + * Mon Feb 22 2021 Jerome Marchand - 0.16.0-3 - Fix slabratetop header issues - Rebuild with latest libbpf -- Gitee From 41260d480f8fdffb8a3ea1e6bfe9592fecf1b79e Mon Sep 17 00:00:00 2001 From: Chunmei Xu Date: Thu, 25 Nov 2021 17:36:59 +0800 Subject: [PATCH 2/5] fix testcases: test_array/test_uprobes/test_clang/test_histogram Signed-off-by: Chunmei Xu --- ....py-add-clock_nanosleep-to-attach-po.patch | 38 +++++++ ...olis-fix-some-test-cases-failed-3235.patch | 98 +++++++++++++++++++ ...ram.py-fix-test-failed-on-kernel-5.1.patch | 36 +++++++ ...-jump-table-optimization-during-jit-.patch | 38 +++++++ bcc.spec | 5 + 5 files changed, 215 insertions(+) create mode 100644 
1001-bcc-anolis-tests-test_array.py-add-clock_nanosleep-to-attach-po.patch create mode 100644 1002-bcc-anolis-fix-some-test-cases-failed-3235.patch create mode 100644 1003-bcc-anolis-test-test_histogram.py-fix-test-failed-on-kernel-5.1.patch create mode 100644 1004-bcc-anolis-aarch64-turn-off-jump-table-optimization-during-jit-.patch diff --git a/1001-bcc-anolis-tests-test_array.py-add-clock_nanosleep-to-attach-po.patch b/1001-bcc-anolis-tests-test_array.py-add-clock_nanosleep-to-attach-po.patch new file mode 100644 index 0000000..370b284 --- /dev/null +++ b/1001-bcc-anolis-tests-test_array.py-add-clock_nanosleep-to-attach-po.patch @@ -0,0 +1,38 @@ +From 21ee51c8df8c569f349d9ca22328b19f2b1fd19b Mon Sep 17 00:00:00 2001 +From: Chunmei Xu +Date: Thu, 24 Dec 2020 09:20:08 +0800 +Subject: [PATCH] tests/test_array.py: add clock_nanosleep to attach point + +since glibc-2.31, the syscall of sleep is clock_nanosleep instead of +nanosleep + +Signed-off-by: Chunmei Xu +--- + tests/python/test_array.py | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/tests/python/test_array.py b/tests/python/test_array.py +index d5e1aee0..1461226d 100755 +--- a/tests/python/test_array.py ++++ b/tests/python/test_array.py +@@ -65,6 +65,8 @@ int do_sys_nanosleep(void *ctx) { + b = BPF(text=text) + b.attach_kprobe(event=b.get_syscall_fnname("nanosleep"), + fn_name="do_sys_nanosleep") ++ b.attach_kprobe(event=b.get_syscall_fnname("clock_nanosleep"), ++ fn_name="do_sys_nanosleep") + b["events"].open_perf_buffer(cb, lost_cb=lost_cb) + subprocess.call(['sleep', '0.1']) + b.perf_buffer_poll() +@@ -98,6 +100,8 @@ int do_sys_nanosleep(void *ctx) { + b = BPF(text=text) + b.attach_kprobe(event=b.get_syscall_fnname("nanosleep"), + fn_name="do_sys_nanosleep") ++ b.attach_kprobe(event=b.get_syscall_fnname("clock_nanosleep"), ++ fn_name="do_sys_nanosleep") + b["events"].open_perf_buffer(cb, lost_cb=lost_cb) + online_cpus = get_online_cpus() + for cpu in online_cpus: +-- +2.19.1.6.gb485710b + diff --git a/1002-bcc-anolis-fix-some-test-cases-failed-3235.patch b/1002-bcc-anolis-fix-some-test-cases-failed-3235.patch new file mode 100644 index 0000000..572d0a6 --- /dev/null +++ b/1002-bcc-anolis-fix-some-test-cases-failed-3235.patch @@ -0,0 +1,98 @@ +From f02c87c92aa3a4938079716b0f011c111446e9ce Mon Sep 17 00:00:00 2001 +From: xuchunmei000 <59222308+xuchunmei000@users.noreply.github.com> +Date: Thu, 21 Jan 2021 23:22:23 +0800 +Subject: [PATCH] fix some test cases failed (#3235) + +* tests/test_uprobes.py: set larger sleep time to pass test on aarch64 +* test/test_clang.py: attach to vfs_read after failed to attach __vfs_read +* tests/test_clang_complex.c: set BPF_TABLE array key type to int + array_map_check_btf in kernel/bpf/arraymap.c check key type must be + BTF_KIND_INT + +Signed-off-by: Chunmei Xu +--- + tests/python/test_clang.py | 6 +++++- + tests/python/test_clang_complex.c | 9 +++------ + tests/python/test_uprobes.py | 4 ++-- + 3 files changed, 10 insertions(+), 9 deletions(-) + +diff --git a/tests/python/test_clang.py b/tests/python/test_clang.py +index 1006bee4..b1fb7e96 100755 +--- a/tests/python/test_clang.py ++++ b/tests/python/test_clang.py +@@ -852,7 +852,11 @@ int trace_read_entry(struct pt_regs *ctx, struct file *file) { + } + """ + b = BPF(text=text) +- b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") ++ try: ++ b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") ++ except Exception: ++ print('Current kernel does not have __vfs_read, try vfs_read instead') ++ 
b.attach_kprobe(event="vfs_read", fn_name="trace_read_entry") + + def test_printk_f(self): + text = """ +diff --git a/tests/python/test_clang_complex.c b/tests/python/test_clang_complex.c +index 8cb9d7c9..b0397e8d 100644 +--- a/tests/python/test_clang_complex.c ++++ b/tests/python/test_clang_complex.c +@@ -13,14 +13,11 @@ struct FwdLeaf { + BPF_HASH(fwd_map, struct FwdKey, struct FwdLeaf, 1); + + // array +-struct ConfigKey { +- u32 index; +-}; + struct ConfigLeaf { + u32 bpfdev_ip; + u32 slave_ip; + }; +-BPF_TABLE("array", struct ConfigKey, struct ConfigLeaf, config_map, 1); ++BPF_TABLE("array", u32, struct ConfigLeaf, config_map, 1); + + // hash + struct MacaddrKey { +@@ -49,7 +46,7 @@ int handle_packet(struct __sk_buff *skb) { + // make sure configured + u32 slave_ip; + +- struct ConfigKey cfg_key = {.index = 0}; ++ u32 cfg_key = 0; + struct ConfigLeaf *cfg_leaf = config_map.lookup(&cfg_key); + if (cfg_leaf) { + slave_ip = cfg_leaf->slave_ip; +@@ -132,7 +129,7 @@ int handle_packet(struct __sk_buff *skb) { + u64 src_mac; + u64 dst_mac; + +- struct ConfigKey cfg_key = {.index = 0}; ++ u32 cfg_key = 0; + struct ConfigLeaf *cfg_leaf = config_map.lookup(&cfg_key); + if (cfg_leaf) { + struct MacaddrKey mac_key = {.ip = cfg_leaf->bpfdev_ip}; +diff --git a/tests/python/test_uprobes.py b/tests/python/test_uprobes.py +index 62a370fa..f7c78e75 100755 +--- a/tests/python/test_uprobes.py ++++ b/tests/python/test_uprobes.py +@@ -119,7 +119,7 @@ int count(struct pt_regs *ctx) { + shutil.copy(libz_path, b"/tmp") + + libz = ctypes.CDLL("/tmp/libz.so.1") +- time.sleep(1) ++ time.sleep(3) + libz.zlibVersion() + time.sleep(5) + os._exit(0) +@@ -130,7 +130,7 @@ int count(struct pt_regs *ctx) { + b = bcc.BPF(text=text) + b.attach_uprobe(name=libname, sym=symname, fn_name="count", pid=child_pid) + b.attach_uretprobe(name=libname, sym=symname, fn_name="count", pid=child_pid) +- time.sleep(1) ++ time.sleep(5) + self.assertEqual(b["stats"][ctypes.c_int(0)].value, 2) + b.detach_uretprobe(name=libname, sym=symname, pid=child_pid) + b.detach_uprobe(name=libname, sym=symname, pid=child_pid) +-- +2.19.1.6.gb485710b + diff --git a/1003-bcc-anolis-test-test_histogram.py-fix-test-failed-on-kernel-5.1.patch b/1003-bcc-anolis-test-test_histogram.py-fix-test-failed-on-kernel-5.1.patch new file mode 100644 index 0000000..e913d2c --- /dev/null +++ b/1003-bcc-anolis-test-test_histogram.py-fix-test-failed-on-kernel-5.1.patch @@ -0,0 +1,36 @@ +From 97cded04a9d6370ac722c6ad8e73b72c4794e851 Mon Sep 17 00:00:00 2001 +From: Chunmei Xu +Date: Fri, 15 Jan 2021 09:51:27 +0800 +Subject: [PATCH] test/test_histogram.py: fix test failed on kernel-5.10 + +kernel commit(cf25e24db61cc) rename tsk->real_start_time to +start_boottime, so test_hostogram will get failed on kernel>=5.5 + +Signed-off-by: Chunmei Xu +--- + tests/python/test_histogram.py | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/tests/python/test_histogram.py b/tests/python/test_histogram.py +index 2fb8c168..ec7950c9 100755 +--- a/tests/python/test_histogram.py ++++ b/tests/python/test_histogram.py +@@ -59,10 +59,15 @@ int kprobe__htab_map_delete_elem(struct pt_regs *ctx, struct bpf_map *map, u64 * + b = BPF(text=""" + #include + #include ++#include + typedef struct { char name[TASK_COMM_LEN]; u64 slot; } Key; + BPF_HISTOGRAM(hist1, Key, 1024); + int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,5,0) + Key k = {.slot = bpf_log2l(prev->real_start_time)}; ++#else ++ Key k = {.slot = 
bpf_log2l(prev->start_boottime)}; ++#endif + if (!bpf_get_current_comm(&k.name, sizeof(k.name))) + hist1.increment(k); + return 0; +-- +2.19.1.6.gb485710b + diff --git a/1004-bcc-anolis-aarch64-turn-off-jump-table-optimization-during-jit-.patch b/1004-bcc-anolis-aarch64-turn-off-jump-table-optimization-during-jit-.patch new file mode 100644 index 0000000..518feba --- /dev/null +++ b/1004-bcc-anolis-aarch64-turn-off-jump-table-optimization-during-jit-.patch @@ -0,0 +1,38 @@ +From 3318c261a9e6ef82484fc0f5f5254599b191b289 Mon Sep 17 00:00:00 2001 +From: Chunmei Xu +Date: Tue, 26 Jan 2021 18:04:24 +0800 +Subject: [PATCH] aarch64: turn off jump table optimization during jit + compilation + +test_clang.py: test_jump_table will get failed with error messgae: +bpf: Failed to load program: Invalid argument +unknown opcode a0 +processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 +peak_states 0 mark_read 0 + +HINT: The 'unknown opcode' can happen if you reference a global or +static variable, or data in read-only section. For example, 'char *p = +"hello"' will result in p referencing a read-only section, and 'char p[] += "hello"' will have "hello" stored on the stack. + +Signed-off-by: Chunmei Xu +--- + src/cc/frontends/clang/loader.cc | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/cc/frontends/clang/loader.cc b/src/cc/frontends/clang/loader.cc +index 1f98cf76..a274f0fe 100644 +--- a/src/cc/frontends/clang/loader.cc ++++ b/src/cc/frontends/clang/loader.cc +@@ -318,7 +318,7 @@ int ClangLoader::do_compile(unique_ptr *mod, TableStorage &ts, + driver::Driver drv("", target_triple, diags); + + #if LLVM_MAJOR_VERSION >= 4 +- if (target_triple == "x86_64-unknown-linux-gnu") ++ if (target_triple == "x86_64-unknown-linux-gnu" || target_triple == "aarch64-unknown-linux-gnu") + flags_cstr.push_back("-fno-jump-tables"); + #endif + +-- +2.19.1.6.gb485710b + diff --git a/bcc.spec b/bcc.spec index 9e5019d..2ad1de7 100644 --- a/bcc.spec +++ b/bcc.spec @@ -24,6 +24,10 @@ Patch3: %{name}-%{version}-slabratetop-Add-memcg_cache_params-struct-def Patch4: %{name}-%{version}-tools-include-kasan-header-in-slabtoprate.patch Patch1000: 1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch +Patch1001: 1001-bcc-anolis-tests-test_array.py-add-clock_nanosleep-to-attach-po.patch +Patch1002: 1002-bcc-anolis-fix-some-test-cases-failed-3235.patch +Patch1003: 1003-bcc-anolis-test-test_histogram.py-fix-test-failed-on-kernel-5.1.patch +Patch1004: 1004-bcc-anolis-aarch64-turn-off-jump-table-optimization-during-jit-.patch # Arches will be included as upstream support is added and dependencies are # satisfied in the respective arches @@ -223,6 +227,7 @@ done %changelog * Fri Nov 19 2021 Chunmei Xu - 0.16.0-3.0.1 - add ali enhanced tools for fs schedule and sys +- fix testcases: test_array/test_uprobes/test_clang/test_histogram * Mon Feb 22 2021 Jerome Marchand - 0.16.0-3 - Fix slabratetop header issues -- Gitee From e84926bdd157b10ca960fff98453071914cd6216 Mon Sep 17 00:00:00 2001 From: Chunmei Xu Date: Thu, 25 Nov 2021 17:38:40 +0800 Subject: [PATCH 3/5] support tools/netqtop Signed-off-by: Chunmei Xu --- 1005-bcc-anolis-Netqtop-3037-3048.patch | 637 ++++++++++++++++++ ...python3-compatible-issue-for-netqtop.patch | 75 +++ ...-fix-netqtop-python3-compatible-3140.patch | 113 ++++ ...anolis-tools-netqtop-fix-rx-queue-id.patch | 59 ++ bcc.spec | 5 + 5 files changed, 889 insertions(+) create mode 100644 1005-bcc-anolis-Netqtop-3037-3048.patch create mode 100644 
1006-bcc-anolis-fix-python3-compatible-issue-for-netqtop.patch
 create mode 100644 1007-bcc-anolis-fix-netqtop-python3-compatible-3140.patch
 create mode 100644 1008-bcc-anolis-tools-netqtop-fix-rx-queue-id.patch

diff --git a/1005-bcc-anolis-Netqtop-3037-3048.patch b/1005-bcc-anolis-Netqtop-3037-3048.patch
new file mode 100644
index 0000000..2eb8dfe
--- /dev/null
+++ b/1005-bcc-anolis-Netqtop-3037-3048.patch
@@ -0,0 +1,637 @@
+From ba73657cb8c4dab83dfb89eed4a8b3866255569a Mon Sep 17 00:00:00 2001
+From: Hao <1075808668@qq.com>
+Date: Sat, 12 Sep 2020 02:05:29 +0800
+Subject: [PATCH] Netqtop 3037 (#3048)
+
+The tool netqtop uses tracepoints NET_DEV_START_XMIT and NETIF_RECEIVE_SKB to intercept every transmitted and received packet; as a result, considerable performance degradation is expected.
+Details of the performance evaluation can be found at
+ https://github.com/iovisor/bcc/pull/3048
+---
+ README.md | 1 +
+ man/man8/netqtop.8 | 56 ++++++++++
+ tools/netqtop.c | 113 ++++++++++++++++++++
+ tools/netqtop.py | 218 ++++++++++++++++++++++++++++++++++++++
+ tools/netqtop_example.txt | 190 +++++++++++++++++++++++++++++++++
+ 5 files changed, 578 insertions(+)
+ create mode 100644 man/man8/netqtop.8
+ create mode 100644 tools/netqtop.c
+ create mode 100755 tools/netqtop.py
+ create mode 100644 tools/netqtop_example.txt
+
+diff --git a/README.md b/README.md
+index 58340622..d54e3848 100644
+--- a/README.md
++++ b/README.md
+@@ -128,6 +128,7 @@ pair of .c and .py files, and some are directories of files.
+ - tools/[memleak](tools/memleak.py): Display outstanding memory allocations to find memory leaks. [Examples](tools/memleak_example.txt).
+ - tools/[mountsnoop](tools/mountsnoop.py): Trace mount and umount syscalls system-wide. [Examples](tools/mountsnoop_example.txt).
+ - tools/[mysqld_qslower](tools/mysqld_qslower.py): Trace MySQL server queries slower than a threshold. [Examples](tools/mysqld_qslower_example.txt).
++- tools/[netqtop](tools/netqtop.py) tools/[netqtop.c](tools/netqtop.c): Trace and display packets distribution on NIC queues. [Examples](tools/netqtop_example.txt).
+ - tools/[nfsslower](tools/nfsslower.py): Trace slow NFS operations. [Examples](tools/nfsslower_example.txt).
+ - tools/[nfsdist](tools/nfsdist.py): Summarize NFS operation latency distribution as a histogram. [Examples](tools/nfsdist_example.txt).
+ - tools/[offcputime](tools/offcputime.py): Summarize off-CPU time by kernel stack trace. [Examples](tools/offcputime_example.txt).
+diff --git a/man/man8/netqtop.8 b/man/man8/netqtop.8
+new file mode 100644
+index 00000000..bfa34d11
+--- /dev/null
++++ b/man/man8/netqtop.8
+@@ -0,0 +1,56 @@
++.TH netqtop 8 "2020-07-30" "USER COMMANDS"
++.SH NAME
++netqtop \- Summarize PPS, BPS, average size of packets and packet counts ordered by packet sizes
++on each queue of a network interface.
++.SH SYNOPSIS
++.B netqtop [\-n nic] [\-i interval] [\-t throughput]
++.SH DESCRIPTION
++netqtop accounts statistics of both transmitted and received packets on each queue of
++a specified network interface to help developers check if its traffic load is balanced.
++The result is displayed as a table with columns of PPS, BPS, average size and
++packet counts in the ranges [0, 64), [64, 512), [512, 2048), [2048, 16K), [16K, 64K).
++This is printed every given interval (default 1) in seconds.
++
++The tool uses the net:net_dev_start_xmit and net:netif_receive_skb kernel tracepoints.
++Since it uses tracepoints, the tool only works on Linux 4.7+. 
++
++netqtop introduces significant overhead when network traffic is heavy. See the OVERHEAD
++section below.
++
++.SH REQUIREMENTS
++CONFIG_BPF and bcc
++.SH OPTIONS
++.TP
++\-n NIC
++Specify the network interface card
++.TP
++\-i INTERVAL
++Print results every INTERVAL seconds.
++The default value is 1.
++.TP
++\-t THROUGHPUT
++Print BPS and PPS of each queue.
++.SH EXAMPLES
++.TP
++Account statistics of eth0 and output every 2 seconds:
++#
++.B netqtop -n eth0 -i 2
++.SH OVERHEAD
++In performance tests, netqtop introduces an overhead of up to a 30% PPS drop
++when the printing interval is set to 1 second. So be mindful of potential packet drop
++when using this tool.
++
++It also increases ping-pong latency by about 1 usec.
++.SH SOURCE
++This is from bcc.
++.IP
++https://github.com/iovisor/bcc
++.PP
++Also look in the bcc distribution for a netqtop_example.txt file containing
++example usage, output and commentary for this tool.
++.SH OS
++Linux
++.SH STABILITY
++Unstable - in development
++.SH AUTHOR
++Yolandajn
+diff --git a/tools/netqtop.c b/tools/netqtop.c
+new file mode 100644
+index 00000000..52605dda
+--- /dev/null
++++ b/tools/netqtop.c
+@@ -0,0 +1,113 @@
++
++#include
++#include
++#if IFNAMSIZ != 16
++#error "IFNAMSIZ != 16 is not supported"
++#endif
++#define MAX_QUEUE_NUM 1024
++
++/**
++* This union is used to store the name of the specified interface
++* and to read it as two different data types
++*/
++union name_buf{
++    char name[IFNAMSIZ];
++    struct {
++        u64 hi;
++        u64 lo;
++    }name_int;
++};
++
++/* data retrieved in tracepoints */
++struct queue_data{
++    u64 total_pkt_len;
++    u32 num_pkt;
++    u32 size_64B;
++    u32 size_512B;
++    u32 size_2K;
++    u32 size_16K;
++    u32 size_64K;
++};
++
++/* array of length 1 for device name */
++BPF_ARRAY(name_map, union name_buf, 1);
++/* table for transmit & receive packets */
++BPF_HASH(tx_q, u16, struct queue_data, MAX_QUEUE_NUM);
++BPF_HASH(rx_q, u16, struct queue_data, MAX_QUEUE_NUM);
++
++static inline int name_filter(struct sk_buff* skb){
++    /* get device name from skb */
++    union name_buf real_devname;
++    struct net_device *dev;
++    bpf_probe_read(&dev, sizeof(skb->dev), ((char *)skb + offsetof(struct sk_buff, dev)));
++    bpf_probe_read(&real_devname, IFNAMSIZ, dev->name);
++
++    int key=0;
++    union name_buf *leaf = name_map.lookup(&key);
++    if(!leaf){
++        return 0;
++    }
++    if((leaf->name_int).hi != real_devname.name_int.hi || (leaf->name_int).lo != real_devname.name_int.lo){
++        return 0;
++    }
++
++    return 1;
++}
++
++static void updata_data(struct queue_data *data, u64 len){
++    data->total_pkt_len += len;
++    data->num_pkt ++;
++    if(len / 64 == 0){
++        data->size_64B ++;
++    }
++    else if(len / 512 == 0){
++        data->size_512B ++;
++    }
++    else if(len / 2048 == 0){
++        data->size_2K ++;
++    }
++    else if(len / 16384 == 0){
++        data->size_16K ++;
++    }
++    else if(len / 65536 == 0){
++        data->size_64K ++;
++    }
++}
++
++TRACEPOINT_PROBE(net, net_dev_start_xmit){
++    /* read device name */
++    struct sk_buff* skb = (struct sk_buff*)args->skbaddr;
++    if(!name_filter(skb)){
++        return 0;
++    }
++
++    /* update table */
++    u16 qid = skb->queue_mapping;
++    struct queue_data newdata;
++    __builtin_memset(&newdata, 0, sizeof(newdata));
++    struct queue_data *data = tx_q.lookup_or_try_init(&qid, &newdata);
++    if(!data){
++        return 0;
++    }
++    updata_data(data, skb->len);
++
++    return 0;
++}
++
++TRACEPOINT_PROBE(net, netif_receive_skb){
++    struct sk_buff* skb = (struct sk_buff*)args->skbaddr;
++    if(!name_filter(skb)){
++        return 0;
++    }
++
++    u16 qid = skb->queue_mapping;
++    struct 
queue_data newdata; ++ __builtin_memset(&newdata, 0, sizeof(newdata)); ++ struct queue_data *data = rx_q.lookup_or_try_init(&qid, &newdata); ++ if(!data){ ++ return 0; ++ } ++ updata_data(data, skb->len); ++ ++ return 0; ++} +diff --git a/tools/netqtop.py b/tools/netqtop.py +new file mode 100755 +index 00000000..e2823ac6 +--- /dev/null ++++ b/tools/netqtop.py +@@ -0,0 +1,218 @@ ++#!/usr/bin/python ++ ++from bcc import BPF ++from ctypes import * ++import argparse ++import os ++from time import sleep,time,localtime,asctime ++import types ++ ++# pre defines ------------------------------- ++ROOT_PATH = "/sys/class/net" ++IFNAMSIZ = 16 ++COL_WIDTH = 10 ++MAX_QUEUE_NUM = 1024 ++EBPF_FILE = "netqtop.c" ++ ++# structure for network interface name array ++class Devname(Structure): ++ _fields_=[ ++ ('name', c_char*IFNAMSIZ) ++ ] ++ ++################## printer for results ################### ++def to_str(num): ++ s = "" ++ if num > 1000000: ++ return str(round(num/(1024*1024.0), 2)) + 'M' ++ elif num > 1000: ++ return str(round(num/1024.0, 2)) + 'K' ++ else: ++ if type(num) == types.FloatType: ++ return str(round(num, 2)) ++ else: ++ return str(num) ++ ++def print_table(table, qnum): ++ global print_interval ++ ++ # ---- print headers ---------------- ++ headers = [ ++ "QueueID", ++ "avg_size", ++ "[0, 64)", ++ "[64, 512)", ++ "[512, 2K)", ++ "[2K, 16K)", ++ "[16K, 64K)" ++ ] ++ if args.throughput: ++ headers.append("BPS") ++ headers.append("PPS") ++ ++ for hd in headers: ++ print(hd.center(COL_WIDTH)), ++ print ++ ++ # ------- calculates -------------- ++ qids=[] ++ tBPS = 0 ++ tPPS = 0 ++ tAVG = 0 ++ tGroup = [0,0,0,0,0] ++ tpkt = 0 ++ tlen = 0 ++ for k, v in table.items(): ++ qids += [k.value] ++ tlen += v.total_pkt_len ++ tpkt += v.num_pkt ++ tGroup[0] += v.size_64B ++ tGroup[1] += v.size_512B ++ tGroup[2] += v.size_2K ++ tGroup[3] += v.size_16K ++ tGroup[4] += v.size_64K ++ tBPS = tlen / print_interval ++ tPPS = tpkt / print_interval ++ if tpkt != 0: ++ tAVG = tlen / tpkt ++ ++ # -------- print table -------------- ++ for k in range(qnum): ++ if k in qids: ++ item = table[c_ushort(k)] ++ data = [ ++ k, ++ item.total_pkt_len, ++ item.num_pkt, ++ item.size_64B, ++ item.size_512B, ++ item.size_2K, ++ item.size_16K, ++ item.size_64K ++ ] ++ else: ++ data = [k,0,0,0,0,0,0,0] ++ ++ # print a line per queue ++ avg = 0 ++ if data[2] != 0: ++ avg = data[1] / data[2] ++ print("%5d %11s %10s %10s %10s %10s %10s" % ( ++ data[0], ++ to_str(avg), ++ to_str(data[3]), ++ to_str(data[4]), ++ to_str(data[5]), ++ to_str(data[6]), ++ to_str(data[7]) ++ )), ++ if args.throughput: ++ BPS = data[1] / print_interval ++ PPS = data[2] / print_interval ++ print("%10s %10s" % ( ++ to_str(BPS), ++ to_str(PPS) ++ )) ++ else: ++ print ++ ++ # ------- print total -------------- ++ print(" Total %10s %10s %10s %10s %10s %10s" % ( ++ to_str(tAVG), ++ to_str(tGroup[0]), ++ to_str(tGroup[1]), ++ to_str(tGroup[2]), ++ to_str(tGroup[3]), ++ to_str(tGroup[4]) ++ )), ++ ++ if args.throughput: ++ print("%10s %10s" % ( ++ to_str(tBPS), ++ to_str(tPPS) ++ )) ++ else: ++ print ++ ++ ++def print_result(b): ++ # --------- print tx queues --------------- ++ print(asctime(localtime(time()))) ++ print("TX") ++ table = b['tx_q'] ++ print_table(table, tx_num) ++ b['tx_q'].clear() ++ ++ # --------- print rx queues --------------- ++ print("") ++ print("RX") ++ table = b['rx_q'] ++ print_table(table, rx_num) ++ b['rx_q'].clear() ++ if args.throughput: ++ print("-"*95) ++ else: ++ print("-"*76) ++ ++############## specify network interface 
#################
++parser = argparse.ArgumentParser(description="")
++parser.add_argument("--name", "-n", type=str, default="")
++parser.add_argument("--interval", "-i", type=float, default=1)
++parser.add_argument("--throughput", "-t", action="store_true")
++parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS)
++args = parser.parse_args()
++
++if args.ebpf:
++    with open(EBPF_FILE) as fileobj:
++        progtxt = fileobj.read()
++    print(progtxt)
++    exit()
++
++if args.name == "":
++    print ("Please specify a network interface.")
++    exit()
++else:
++    dev_name = args.name
++
++if len(dev_name) > IFNAMSIZ-1:
++    print ("NIC name too long")
++    exit()
++
++print_interval = args.interval + 0.0
++if print_interval == 0:
++    print "print interval must be non-zero"
++    exit()
++
++################ get number of queues #####################
++tx_num = 0
++rx_num = 0
++path = ROOT_PATH + "/" + dev_name + "/queues"
++if not os.path.exists(path):
++    print "Net interface", dev_name, "does not exits."
++    exit()
++
++list = os.listdir(path)
++for s in list:
++    if s[0] == 'r':
++        rx_num += 1
++    if s[0] == 't':
++        tx_num += 1
++
++if tx_num > MAX_QUEUE_NUM or rx_num > MAX_QUEUE_NUM:
++    print "number of queues over 1024 is not supported."
++    exit()
++
++################## start tracing ##################
++b = BPF(src_file = EBPF_FILE)
++# --------- set hash array --------
++devname_map = b['name_map']
++_name = Devname()
++_name.name = dev_name
++devname_map[0] = _name
++
++while 1:
++    try:
++        sleep(print_interval)
++        print_result(b)
++    except KeyboardInterrupt:
++        exit()
+diff --git a/tools/netqtop_example.txt b/tools/netqtop_example.txt
+new file mode 100644
+index 00000000..443cfb71
+--- /dev/null
++++ b/tools/netqtop_example.txt
+@@ -0,0 +1,190 @@
++Demonstrations of netqtop.
++
++
++netqtop traces the kernel functions performing packet transmit (xmit_one)
++and packet receive (__netif_receive_skb_core) on the data link layer. The tool
++not only traces every packet on a specified network interface, but also accounts
++the PPS, BPS and average size of packets, as well as packet counts (categorized by
++size range) in the sending and receiving directions respectively. Results are printed
++as tables, which can be used to understand traffic load allocation on each queue
++of the interface of interest and to check whether the load is balanced. The overall
++totals are provided at the bottom.
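++
++(Editorial sketch, not part of the upstream example file: because the
++per-queue counters live in the two BPF hash tables declared in netqtop.c,
++tx_q and rx_q, they can also be read from a few lines of custom Python,
++mirroring what netqtop.py does. The snippet assumes netqtop.c sits in the
++current directory, as the tool itself assumes, and watches "lo":
++
++    from bcc import BPF
++    from ctypes import Structure, c_char
++    from time import sleep
++
++    class Devname(Structure):
++        _fields_ = [("name", c_char * 16)]    # IFNAMSIZ == 16
++
++    b = BPF(src_file="netqtop.c")
++    name = Devname()
++    name.name = b"lo"                         # NIC matched by name_filter()
++    b["name_map"][0] = name
++
++    sleep(1)
++    for qid, data in b["tx_q"].items():       # rx_q works the same way
++        print(qid.value, data.num_pkt, data.total_pkt_len)
++)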
++ ++For example, suppose you want to know current traffic on lo, and print result ++every second: ++# ./netqtop.py -n lo -i 1 ++Thu Sep 10 11:28:39 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 88 0 9 0 0 0 ++ Total 88 0 9 0 0 0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 74 4 5 0 0 0 ++ Total 74 4 5 0 0 0 ++---------------------------------------------------------------------------- ++Thu Sep 10 11:28:40 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 233 0 3 1 0 0 ++ Total 233 0 3 1 0 0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 219 2 1 1 0 0 ++ Total 219 2 1 1 0 0 ++---------------------------------------------------------------------------- ++ ++or you can just use the default mode ++# ./netqtop.py -n lo ++Thu Sep 10 11:27:45 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 92 0 7 0 0 0 ++ Total 92 0 7 0 0 0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 78 3 4 0 0 0 ++ Total 78 3 4 0 0 0 ++---------------------------------------------------------------------------- ++Thu Sep 10 11:27:46 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 179 0 5 1 0 0 ++ Total 179 0 5 1 0 0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 165 3 2 1 0 0 ++ Total 165 3 2 1 0 0 ++---------------------------------------------------------------------------- ++ ++This NIC only has 1 queue. ++If you want the tool to print results after a longer interval, specify seconds with -i: ++# ./netqtop.py -n lo -i 3 ++Thu Sep 10 11:31:26 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 85 0 11 0 0 0 ++ Total 85 0 11 0 0 0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 71 5 6 0 0 0 ++ Total 71 5 6 0 0 0 ++---------------------------------------------------------------------------- ++Thu Sep 10 11:31:29 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 153 0 7 1 0 0 ++ Total 153 0 7 1 0 0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) ++ 0 139 4 3 1 0 0 ++ Total 139 4 3 1 0 0 ++---------------------------------------------------------------------------- ++ ++To see PPS and BPS of each queue, use -t: ++# ./netqtop.py -n lo -i 1 -t ++Thu Sep 10 11:37:02 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) BPS PPS ++ 0 114 0 10 0 0 0 1.11K 10.0 ++ Total 114 0 10 0 0 0 1.11K 10.0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) BPS PPS ++ 0 100 4 6 0 0 0 1000.0 10.0 ++ Total 100 4 6 0 0 0 1000.0 10.0 ++----------------------------------------------------------------------------------------------- ++Thu Sep 10 11:37:03 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) BPS PPS ++ 0 271 0 3 1 0 0 1.06K 4.0 ++ Total 271 0 3 1 0 0 1.06K 4.0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) BPS PPS ++ 0 257 2 1 1 0 0 1.0K 4.0 ++ Total 257 2 1 1 0 0 1.0K 4.0 ++----------------------------------------------------------------------------------------------- ++ ++When filtering multi-queue NICs, you do not need to specify the number of queues, ++the tool calculates it for you: ++# ./netqtop.py -n eth0 -t ++Thu Sep 10 11:39:21 2020 ++TX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) BPS PPS ++ 
0 0 0 0 0 0 0 0.0 0.0 ++ 1 0 0 0 0 0 0 0.0 0.0 ++ 2 0 0 0 0 0 0 0.0 0.0 ++ 3 0 0 0 0 0 0 0.0 0.0 ++ 4 0 0 0 0 0 0 0.0 0.0 ++ 5 0 0 0 0 0 0 0.0 0.0 ++ 6 0 0 0 0 0 0 0.0 0.0 ++ 7 0 0 0 0 0 0 0.0 0.0 ++ 8 54 2 0 0 0 0 108.0 2.0 ++ 9 161 0 9 0 0 0 1.42K 9.0 ++ 10 0 0 0 0 0 0 0.0 0.0 ++ 11 0 0 0 0 0 0 0.0 0.0 ++ 12 0 0 0 0 0 0 0.0 0.0 ++ 13 0 0 0 0 0 0 0.0 0.0 ++ 14 0 0 0 0 0 0 0.0 0.0 ++ 15 0 0 0 0 0 0 0.0 0.0 ++ 16 0 0 0 0 0 0 0.0 0.0 ++ 17 0 0 0 0 0 0 0.0 0.0 ++ 18 0 0 0 0 0 0 0.0 0.0 ++ 19 0 0 0 0 0 0 0.0 0.0 ++ 20 0 0 0 0 0 0 0.0 0.0 ++ 21 0 0 0 0 0 0 0.0 0.0 ++ 22 0 0 0 0 0 0 0.0 0.0 ++ 23 0 0 0 0 0 0 0.0 0.0 ++ 24 0 0 0 0 0 0 0.0 0.0 ++ 25 0 0 0 0 0 0 0.0 0.0 ++ 26 0 0 0 0 0 0 0.0 0.0 ++ 27 0 0 0 0 0 0 0.0 0.0 ++ 28 0 0 0 0 0 0 0.0 0.0 ++ 29 0 0 0 0 0 0 0.0 0.0 ++ 30 0 0 0 0 0 0 0.0 0.0 ++ 31 0 0 0 0 0 0 0.0 0.0 ++ Total 141 2 9 0 0 0 1.52K 11.0 ++ ++RX ++ QueueID avg_size [0, 64) [64, 512) [512, 2K) [2K, 16K) [16K, 64K) BPS PPS ++ 0 127 3 9 0 0 0 1.5K 12.0 ++ 1 0 0 0 0 0 0 0.0 0.0 ++ 2 0 0 0 0 0 0 0.0 0.0 ++ 3 0 0 0 0 0 0 0.0 0.0 ++ 4 0 0 0 0 0 0 0.0 0.0 ++ 5 0 0 0 0 0 0 0.0 0.0 ++ 6 0 0 0 0 0 0 0.0 0.0 ++ 7 0 0 0 0 0 0 0.0 0.0 ++ 8 0 0 0 0 0 0 0.0 0.0 ++ 9 0 0 0 0 0 0 0.0 0.0 ++ 10 0 0 0 0 0 0 0.0 0.0 ++ 11 0 0 0 0 0 0 0.0 0.0 ++ 12 0 0 0 0 0 0 0.0 0.0 ++ 13 0 0 0 0 0 0 0.0 0.0 ++ 14 0 0 0 0 0 0 0.0 0.0 ++ 15 0 0 0 0 0 0 0.0 0.0 ++ 16 0 0 0 0 0 0 0.0 0.0 ++ 17 0 0 0 0 0 0 0.0 0.0 ++ 18 0 0 0 0 0 0 0.0 0.0 ++ 19 0 0 0 0 0 0 0.0 0.0 ++ 20 0 0 0 0 0 0 0.0 0.0 ++ 21 0 0 0 0 0 0 0.0 0.0 ++ 22 0 0 0 0 0 0 0.0 0.0 ++ 23 0 0 0 0 0 0 0.0 0.0 ++ 24 0 0 0 0 0 0 0.0 0.0 ++ 25 0 0 0 0 0 0 0.0 0.0 ++ 26 0 0 0 0 0 0 0.0 0.0 ++ 27 0 0 0 0 0 0 0.0 0.0 ++ 28 0 0 0 0 0 0 0.0 0.0 ++ 29 0 0 0 0 0 0 0.0 0.0 ++ 30 0 0 0 0 0 0 0.0 0.0 ++ 31 0 0 0 0 0 0 0.0 0.0 ++ Total 127 3 9 0 0 0 1.5K 12.0 ++----------------------------------------------------------------------------------------------- +\ No newline at end of file +-- +2.29.2 + diff --git a/1006-bcc-anolis-fix-python3-compatible-issue-for-netqtop.patch b/1006-bcc-anolis-fix-python3-compatible-issue-for-netqtop.patch new file mode 100644 index 0000000..043b7b4 --- /dev/null +++ b/1006-bcc-anolis-fix-python3-compatible-issue-for-netqtop.patch @@ -0,0 +1,75 @@ +From 6ba4dc1fb110b1db4a3ba187fa60a152f64c413f Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 16 Oct 2020 11:12:53 -0700 +Subject: [PATCH] fix python3 compatible issue for netqtop + +Otherwise, we have: + ... + File "bcc/tools/netqtop.py", line 54 + print(hd.center(COL_WIDTH)), + ^ + TabError: inconsistent use of tabs and spaces in indentation + ... + + File "bcc/tools/tcprtt.py", line 117, in + bpf_text = bpf_text.replace(b'LPORTFILTER', b'') + TypeError: replace() argument 1 must be str, not bytes + ... 
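+
+[Editorial note, not part of the original commit message: the tcprtt.py
+failure above is the usual Python 3 str/bytes mismatch. A minimal
+illustration of the pattern being fixed:
+
+    text = "LPORTFILTER"                      # program text is str here
+    # text.replace(b'LPORTFILTER', b'')       # TypeError on Python 3
+    text = text.replace('LPORTFILTER', '')    # same-type arguments work
+]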
+ +Signed-off-by: Yonghong Song +--- + tools/netqtop.py | 10 +++++----- + 2 files changed, 17 insertions(+), 17 deletions(-) + +diff --git a/tools/netqtop.py b/tools/netqtop.py +index e2823ac6..dbe9206a 100755 +--- a/tools/netqtop.py ++++ b/tools/netqtop.py +@@ -51,7 +51,7 @@ def print_table(table, qnum): + headers.append("PPS") + + for hd in headers: +- print(hd.center(COL_WIDTH)), ++ print(hd.center(COL_WIDTH)) + print + + # ------- calculates -------------- +@@ -180,7 +180,7 @@ if len(dev_name) > IFNAMSIZ-1: + + print_interval = args.interval + 0.0 + if print_interval == 0: +- print "print interval must be non-zero" ++ print ("print interval must be non-zero") + exit() + + ################ get number of queues ##################### +@@ -188,7 +188,7 @@ tx_num = 0 + rx_num = 0 + path = ROOT_PATH + "/" + dev_name + "/queues" + if not os.path.exists(path): +- print "Net interface", dev_name, "does not exits." ++ print ("Net interface", dev_name, "does not exits.") + exit() + + list = os.listdir(path) +@@ -199,7 +199,7 @@ for s in list: + tx_num += 1 + + if tx_num > MAX_QUEUE_NUM or rx_num > MAX_QUEUE_NUM: +- print "number of queues over 1024 is not supported." ++ print ("number of queues over 1024 is not supported.") + exit() + + ################## start tracing ################## +@@ -207,7 +207,7 @@ b = BPF(src_file = EBPF_FILE) + # --------- set hash array -------- + devname_map = b['name_map'] + _name = Devname() +-_name.name = dev_name ++_name.name = dev_name.encode() + devname_map[0] = _name + + while 1: +-- +2.29.2 + diff --git a/1007-bcc-anolis-fix-netqtop-python3-compatible-3140.patch b/1007-bcc-anolis-fix-netqtop-python3-compatible-3140.patch new file mode 100644 index 0000000..99e783e --- /dev/null +++ b/1007-bcc-anolis-fix-netqtop-python3-compatible-3140.patch @@ -0,0 +1,113 @@ +From ec3747ed6b16f9eec36a204dfbe3506d3778dcb4 Mon Sep 17 00:00:00 2001 +From: Hao <1075808668@qq.com> +Date: Mon, 19 Oct 2020 22:21:10 +0800 +Subject: [PATCH] fix netqtop python3 compatible (#3140) + +* fix netqtop python3 compatible +* delete import types +--- + tools/netqtop.py | 27 ++++++++++++++------------- + 1 file changed, 14 insertions(+), 13 deletions(-) + +diff --git a/tools/netqtop.py b/tools/netqtop.py +index dbe9206a..47e9103e 100755 +--- a/tools/netqtop.py ++++ b/tools/netqtop.py +@@ -1,11 +1,11 @@ + #!/usr/bin/python + ++from __future__ import print_function + from bcc import BPF + from ctypes import * + import argparse + import os + from time import sleep,time,localtime,asctime +-import types + + # pre defines ------------------------------- + ROOT_PATH = "/sys/class/net" +@@ -28,7 +28,7 @@ def to_str(num): + elif num > 1000: + return str(round(num/1024.0, 2)) + 'K' + else: +- if type(num) == types.FloatType: ++ if isinstance(num, float): + return str(round(num, 2)) + else: + return str(num) +@@ -50,9 +50,10 @@ def print_table(table, qnum): + headers.append("BPS") + headers.append("PPS") + ++ print(" ", end="") + for hd in headers: +- print(hd.center(COL_WIDTH)) +- print ++ print( "%-11s" % hd, end="") ++ print() + + # ------- calculates -------------- + qids=[] +@@ -97,7 +98,7 @@ def print_table(table, qnum): + avg = 0 + if data[2] != 0: + avg = data[1] / data[2] +- print("%5d %11s %10s %10s %10s %10s %10s" % ( ++ print(" %-11d%-11s%-11s%-11s%-11s%-11s%-11s" % ( + data[0], + to_str(avg), + to_str(data[3]), +@@ -105,34 +106,34 @@ def print_table(table, qnum): + to_str(data[5]), + to_str(data[6]), + to_str(data[7]) +- )), ++ ), end="") + if args.throughput: + BPS = data[1] / 
print_interval + PPS = data[2] / print_interval +- print("%10s %10s" % ( ++ print("%-11s%-11s" % ( + to_str(BPS), + to_str(PPS) + )) + else: +- print ++ print() + + # ------- print total -------------- +- print(" Total %10s %10s %10s %10s %10s %10s" % ( ++ print(" Total %-11s%-11s%-11s%-11s%-11s%-11s" % ( + to_str(tAVG), + to_str(tGroup[0]), + to_str(tGroup[1]), + to_str(tGroup[2]), + to_str(tGroup[3]), + to_str(tGroup[4]) +- )), ++ ), end="") + + if args.throughput: +- print("%10s %10s" % ( ++ print("%-11s%-11s" % ( + to_str(tBPS), + to_str(tPPS) + )) + else: +- print ++ print() + + + def print_result(b): +@@ -152,7 +153,7 @@ def print_result(b): + if args.throughput: + print("-"*95) + else: +- print("-"*76) ++ print("-"*77) + + ############## specify network interface ################# + parser = argparse.ArgumentParser(description="") +-- +2.29.2 + diff --git a/1008-bcc-anolis-tools-netqtop-fix-rx-queue-id.patch b/1008-bcc-anolis-tools-netqtop-fix-rx-queue-id.patch new file mode 100644 index 0000000..5d5f66a --- /dev/null +++ b/1008-bcc-anolis-tools-netqtop-fix-rx-queue-id.patch @@ -0,0 +1,59 @@ +From a55192b26d0a9294ed4e0bcd8170225dad62dd61 Mon Sep 17 00:00:00 2001 +From: zhenwei pi +Date: Mon, 21 Dec 2020 19:49:36 +0800 +Subject: [PATCH] tools/netqtop: fix rx queue id + +Net device driver uses 'skb_record_rx_queue' to mark rx queue +mapping during receiving a packet. Also need to call +'skb_get_rx_queue' to get rx queue id. + +Signed-off-by: zhenwei pi +--- + tools/netqtop.c | 22 ++++++++++++++++++---- + 1 file changed, 18 insertions(+), 4 deletions(-) + +diff --git a/tools/netqtop.c b/tools/netqtop.c +index 52605dda..e64ed7fd 100644 +--- a/tools/netqtop.c ++++ b/tools/netqtop.c +@@ -95,19 +95,33 @@ TRACEPOINT_PROBE(net, net_dev_start_xmit){ + } + + TRACEPOINT_PROBE(net, netif_receive_skb){ +- struct sk_buff* skb = (struct sk_buff*)args->skbaddr; +- if(!name_filter(skb)){ ++ struct sk_buff skb; ++ ++ bpf_probe_read(&skb, sizeof(skb), args->skbaddr); ++ if(!name_filter(&skb)){ + return 0; + } + +- u16 qid = skb->queue_mapping; ++ /* case 1: if the NIC does not support multi-queue feature, there is only ++ * one queue(qid is always 0). ++ * case 2: if the NIC supports multi-queue feature, there are several queues ++ * with different qid(from 0 to n-1). ++ * The net device driver should mark queue id by API 'skb_record_rx_queue' ++ * for a recieved skb, otherwise it should be a BUG(all of the packets are ++ * reported as queue 0). 
For example, virtio net driver is fixed for linux: ++ * commit: 133bbb18ab1a2("virtio-net: per-queue RPS config") ++ */ ++ u16 qid = 0; ++ if (skb_rx_queue_recorded(&skb)) ++ qid = skb_get_rx_queue(&skb); ++ + struct queue_data newdata; + __builtin_memset(&newdata, 0, sizeof(newdata)); + struct queue_data *data = rx_q.lookup_or_try_init(&qid, &newdata); + if(!data){ + return 0; + } +- updata_data(data, skb->len); ++ updata_data(data, skb.len); + + return 0; + } +-- +2.29.2 + diff --git a/bcc.spec b/bcc.spec index 2ad1de7..c1bb3f1 100644 --- a/bcc.spec +++ b/bcc.spec @@ -28,6 +28,10 @@ Patch1001: 1001-bcc-anolis-tests-test_array.py-add-clock_nanosleep-to-attac Patch1002: 1002-bcc-anolis-fix-some-test-cases-failed-3235.patch Patch1003: 1003-bcc-anolis-test-test_histogram.py-fix-test-failed-on-kernel-5.1.patch Patch1004: 1004-bcc-anolis-aarch64-turn-off-jump-table-optimization-during-jit-.patch +Patch1005: 1005-bcc-anolis-Netqtop-3037-3048.patch +Patch1006: 1006-bcc-anolis-fix-python3-compatible-issue-for-netqtop.patch +Patch1007: 1007-bcc-anolis-fix-netqtop-python3-compatible-3140.patch +Patch1008: 1008-bcc-anolis-tools-netqtop-fix-rx-queue-id.patch # Arches will be included as upstream support is added and dependencies are # satisfied in the respective arches @@ -228,6 +232,7 @@ done * Fri Nov 19 2021 Chunmei Xu - 0.16.0-3.0.1 - add ali enhanced tools for fs schedule and sys - fix testcases: test_array/test_uprobes/test_clang/test_histogram +- support tools/netqtop * Mon Feb 22 2021 Jerome Marchand - 0.16.0-3 - Fix slabratetop header issues -- Gitee From 0ce616e9ba55d74e188cb03220e866c93c8e46ef Mon Sep 17 00:00:00 2001 From: Chunmei Xu Date: Thu, 2 Dec 2021 16:55:55 +0800 Subject: [PATCH 4/5] provide python2-bcc Signed-off-by: Chunmei Xu --- bcc.spec | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/bcc.spec b/bcc.spec index c1bb3f1..4df8624 100644 --- a/bcc.spec +++ b/bcc.spec @@ -1,6 +1,7 @@ %define anolis_release .0.1 # luajit is not available RHEL 8 %bcond_with lua +%bcond_without python2 %bcond_with llvm_static @@ -53,10 +54,19 @@ BuildRequires: ncurses-devel BuildRequires: pkgconfig(luajit) %endif BuildRequires: libbpf-devel >= 0.0.9, libbpf-static >= 0.0.9 +%if %{with python2} +BuildRequires: python2 python2-rpm-macros +%endif Requires: %{name}-tools = %{version}-%{release} Requires: libbpf >= 0.0.9 +%if %{with python2} +%global python_cmds python2;python3 +%else +%global python_cmds python3 +%endif + %description BCC is a toolkit for creating efficient kernel tracing and manipulation programs, and includes several useful tools and examples. 
It makes use of @@ -96,6 +106,16 @@ Requires: %{name}%{?_isa} = %{version}-%{release} %description -n python3-%{name} Python3 bindings for BPF Compiler Collection (BCC) +%if %{with python2} +%package -n python2-%{name} +Summary: Python2 bindings for BPF Compiler Collection (BCC) +Requires: %{name}%{?_isa} = %{version}-%{release} +%{?python_provide:%python_provide python2-%{name}} + +%description -n python2-%{name} +Python2 bindings for BPF Compiler Collection (BCC) +%endif + %if %{with lua} %package lua @@ -124,6 +144,7 @@ Command line tools for BPF Compiler Collection (BCC) -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DREVISION_LAST=%{version} -DREVISION=%{version} -DPYTHON_CMD=python3 \ -DCMAKE_USE_LIBBPF_PACKAGE:BOOL=TRUE \ + -DPYTHON_CMD="%{python_cmds}" \ %{?with_llvm_shared:-DENABLE_LLVM_SHARED=1} %make_build @@ -185,6 +206,11 @@ done %files -n python3-%{name} %{python3_sitelib}/%{name}* +%if %{with python2} +%files -n python2-%{name} +%{python2_sitelib}/%{name}* +%endif + %files doc # % dir % {_docdir}/% {name} %doc %{_datadir}/%{name}/examples/ @@ -233,6 +259,7 @@ done - add ali enhanced tools for fs schedule and sys - fix testcases: test_array/test_uprobes/test_clang/test_histogram - support tools/netqtop +- provides python2-bcc * Mon Feb 22 2021 Jerome Marchand - 0.16.0-3 - Fix slabratetop header issues -- Gitee From 930e7fc2796aa6f4fdd608009a2b3aa35ea162dc Mon Sep 17 00:00:00 2001 From: Chunmei Xu Date: Mon, 6 Dec 2021 11:10:51 +0800 Subject: [PATCH 5/5] set larger range to get probes evnet for test_usdt2.py Signed-off-by: Chunmei Xu --- ...e-to-get-probes-evnet-for-test_usdt2.patch | 42 +++++++++++++++++++ bcc.spec | 2 + 2 files changed, 44 insertions(+) create mode 100644 1009-bcc-anolis-set-larger-range-to-get-probes-evnet-for-test_usdt2.patch diff --git a/1009-bcc-anolis-set-larger-range-to-get-probes-evnet-for-test_usdt2.patch b/1009-bcc-anolis-set-larger-range-to-get-probes-evnet-for-test_usdt2.patch new file mode 100644 index 0000000..4ce5953 --- /dev/null +++ b/1009-bcc-anolis-set-larger-range-to-get-probes-evnet-for-test_usdt2.patch @@ -0,0 +1,42 @@ +From 4354c9ce33496df82c8ffd30497141daae28afaa Mon Sep 17 00:00:00 2001 +From: Chunmei Xu +Date: Tue, 13 Oct 2020 17:45:48 +0800 +Subject: [PATCH] set larger range to get probes evnet for test_usdt2.py + +I test test_usdt2.py on aarch64 with kernel-5.9-rc5, +gcc-9.2.1 and glibc2.32. 
+
+with the current range of 3, the test always fails:
+======================================================================
+FAIL: test_attach1 (__main__.TestUDST)
+----------------------------------------------------------------------
+Traceback (most recent call last):
+  File "test_usdt2.py", line 170, in test_attach1
+    self.assertTrue(self.evt_st_6 == 1)
+AssertionError: False is not true
+
+----------------------------------------------------------------------
+Ran 1 test in 1.068s
+
+FAILED (failures=1)
+
+Signed-off-by: Chunmei Xu
+---
+ tests/python/test_usdt2.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tests/python/test_usdt2.py b/tests/python/test_usdt2.py
+index a2f46110..c8c47fe6 100755
+--- a/tests/python/test_usdt2.py
++++ b/tests/python/test_usdt2.py
+@@ -149,7 +149,7 @@ int do_trace3(struct pt_regs *ctx) {
+     b["event6"].open_perf_buffer(print_event6)
+
+     # three iterations to make sure we get some probes and have time to process them
+-    for i in range(3):
++    for i in range(5):
+         b.perf_buffer_poll()
+
+     # note that event1 and event4 do not really fire, so their state should be 0
+-- 
+2.19.1.6.gb485710b
+
diff --git a/bcc.spec b/bcc.spec
index 4df8624..0c9bdb3 100644
--- a/bcc.spec
+++ b/bcc.spec
@@ -33,6 +33,7 @@ Patch1005: 1005-bcc-anolis-Netqtop-3037-3048.patch
 Patch1006: 1006-bcc-anolis-fix-python3-compatible-issue-for-netqtop.patch
 Patch1007: 1007-bcc-anolis-fix-netqtop-python3-compatible-3140.patch
 Patch1008: 1008-bcc-anolis-tools-netqtop-fix-rx-queue-id.patch
+Patch1009: 1009-bcc-anolis-set-larger-range-to-get-probes-evnet-for-test_usdt2.patch
 
 # Arches will be included as upstream support is added and dependencies are
 # satisfied in the respective arches
@@ -260,6 +261,7 @@ done
 - fix testcases: test_array/test_uprobes/test_clang/test_histogram
 - support tools/netqtop
 - provides python2-bcc
+- set larger range to get probe events for test_usdt2.py
 
 * Mon Feb 22 2021 Jerome Marchand - 0.16.0-3
 - Fix slabratetop header issues
-- 
Gitee
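
Editor's note: patch 5/5 above works around test flakiness by raising a fixed
iteration count around perf_buffer_poll(). A minimal sketch of the underlying
bcc polling pattern, with a bounded timeout as an alternative to counting
iterations (the kprobe on do_nanosleep is only an easy-to-trigger example and
is not taken from the test):

    from bcc import BPF

    prog = """
    BPF_PERF_OUTPUT(events);
    int kprobe__do_nanosleep(void *ctx) {
        u32 pid = bpf_get_current_pid_tgid();
        events.perf_submit(ctx, &pid, sizeof(pid));
        return 0;
    }
    """

    b = BPF(text=prog)
    seen = []
    b["events"].open_perf_buffer(lambda cpu, data, size: seen.append(size))

    # a timeout (in ms) bounds each poll so the loop cannot hang when no
    # events arrive; more iterations still means more chances to drain the
    # buffer, which is what the patch relies on
    for _ in range(5):
        b.perf_buffer_poll(timeout=1000)
    print("perf events seen:", len(seen))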