From 841f5574a6537d9916060dc591c35475acc8a86a Mon Sep 17 00:00:00 2001 From: Bo Ren Date: Fri, 30 Jun 2023 15:51:47 +0800 Subject: [PATCH 1/2] update to bcc-0.25.0-2.el8 Signed-off-by: Bo Ren --- ...i-enhanced-tools-for-fs-schedule-sys.patch | 6172 ----------------- bcc-0.19.0-Define-missing-BPF_-macros.patch | 27 - bcc-0.19.0-Fix-BPF-src_file-foo.patch | 39 - bcc-0.19.0-Fix-a-llvm-compilation-error.patch | 46 - ...of-task_struct_-state-field-on-RHEL-.patch | 110 - ...-APSInt-toString-std-string-variants.patch | 41 - bcc-0.19.0-fix-llvm-compilation-errors.patch | 42 - ...-bpf_pseudo_fd-type-conversion-error.patch | 77 + ...-15-int-to-pointer-conversion-errors.patch | 96 + bcc-0.25.0-Fix-get_kprobe_functions.patch | 30 + ...0-Manpages-remove-unstable-statement.patch | 203 +- bcc-0.25.0-RHEL-libbpf-version-fixes.patch | 684 ++ ...paccept-Fix-support-for-v5.6-kernels.patch | 42 + ...nel_struct_has_field-to-reach-field-.patch | 75 + bcc.spec | 42 +- download | 2 +- 16 files changed, 1154 insertions(+), 6574 deletions(-) delete mode 100644 1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch delete mode 100644 bcc-0.19.0-Define-missing-BPF_-macros.patch delete mode 100644 bcc-0.19.0-Fix-BPF-src_file-foo.patch delete mode 100644 bcc-0.19.0-Fix-a-llvm-compilation-error.patch delete mode 100644 bcc-0.19.0-Handle-renaming-of-task_struct_-state-field-on-RHEL-.patch delete mode 100644 bcc-0.19.0-Remove-APInt-APSInt-toString-std-string-variants.patch delete mode 100644 bcc-0.19.0-fix-llvm-compilation-errors.patch create mode 100644 bcc-0.25.0-Fix-bpf_pseudo_fd-type-conversion-error.patch create mode 100644 bcc-0.25.0-Fix-clang-15-int-to-pointer-conversion-errors.patch create mode 100644 bcc-0.25.0-Fix-get_kprobe_functions.patch rename bcc-0.19.0-Manpages-remove-unstable-statement.patch => bcc-0.25.0-Manpages-remove-unstable-statement.patch (89%) create mode 100644 bcc-0.25.0-RHEL-libbpf-version-fixes.patch create mode 100644 bcc-0.25.0-Revert-tools-tcpaccept-Fix-support-for-v5.6-kernels.patch create mode 100644 bcc-0.25.0-libbpf-Allow-kernel_struct_has_field-to-reach-field-.patch diff --git a/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch b/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch deleted file mode 100644 index e6348c7..0000000 --- a/1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch +++ /dev/null @@ -1,6172 +0,0 @@ -diff --git a/man/man8/alibiolatency.8 b/man/man8/alibiolatency.8 -new file mode 100644 -index 0000000..93f52fa ---- /dev/null -+++ b/man/man8/alibiolatency.8 -@@ -0,0 +1,76 @@ -+.TH alibiolatency 8 "2019-08-25" "USER COMMANDS" -+.SH NAME -+alibiolatency \- Summarize block device I/O latency. -+.SH SYNOPSIS -+.B alibiolatency [\-h] [\-i interval] [-d device] [\-t avg_req_time] [\-T req_time] [-r] -+.SH DESCRIPTION -+ -+Alibiolatency can trace every io request's life cycle and do data filter and statistics -+accordingly. With this tool, we can clearly know io request's different stage's latency -+proportion, for example, if io request D2C(see blktrace) time take majority in total io -+request process time, it's most likely disk bottleneck has been reached. -+ -+This tool uses in-kernel eBPF maps for storing timestamps for efficiency. -+ -+This works by tracing various kernel block tracepoints using dynamic tracing, -+and will need updating to match any changes to these tracepoints. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+\-h -+Print usage message. -+.TP -+\-i interval -+Specify display interval, unit is second. -+.TP -+\-r -+Dump every io request life cycle. -+.TP -+\-d device -+Inspect specified device. -+.TP -+\-t avg_req_time -+Display only when average request process time is greater than this value, unit is ms. -+.TP -+\-T req_time -+Dump request life cycle when single request process time is greater than this value, unit is ms. -+.SH EXAMPLES -+.TP -+Summarize block device I/O latency(default display interval is 2 seconds): -+# -+.B alibiolatency -+.TP -+Print statistics per 5 seconds -+# -+.B alibiolatency \-i 5 -+.TP -+Print only when average request process time is greater than 10ms -+# -+.B alibiolatency \-t 10 -+.TP -+Print request life cycle when single request process time is greater than 20ms. -+# -+.B alibiolatency \-T 20 -+.SH OVERHEAD -+This traces kernel functions and maintains in-kernel timestamps and a histogram, -+which are asynchronously copied to user-space. This method is very efficient, -+and the overhead for most storage I/O rates (< 10k IOPS) should be negligible. -+If you have a higher IOPS storage environment, test and quantify the overhead -+before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Xiaoguang Wang -+.SH SEE ALSO -+biosnoop(8) -diff --git a/man/man8/aliexitsnoop.8 b/man/man8/aliexitsnoop.8 -new file mode 100644 -index 0000000..cd7215e ---- /dev/null -+++ b/man/man8/aliexitsnoop.8 -@@ -0,0 +1,109 @@ -+.TH aliexitsnoop 8 "2019-05-28" "USER COMMANDS" -+.SH NAME -+aliexitsnoop \- Trace all process termination (exit, fatal signal). Uses Linux eBPF/bcc. -+.SH SYNOPSIS -+.B aliexitsnoop [\-h] [\-t] [\-\-utc] [\-x] [\-p PID] [\-\-label LABEL] -+.SH DESCRIPTION -+aliexitsnoop traces process termination, showing the command name and reason for -+termination, either an exit or a fatal signal. This Linux tool is actually an enhanced -+version of the exitsnoop. For more details about exitsnoop, see exitsnoop_example.txt. -+Compared to the exitsnoop, aliexitsnoop not only print the exit code, pid, ppid, lifetime -+of the exiting processes, but also the stack of the exiting processes. The demand stems -+from the fact that in the production environment, some processes end up unexpectedly. -+Therefore we need to dump the stack of the target processes to find out what caused the -+target process to exit. -+ -+It catches processes of all users, processes in containers, as well -+as processes that become zombie. -+ -+This works by tracing the kernel do_exit() function, -+and will need updating to match any changes to this function. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-t -+Include a timestamp column. -+.TP -+\-\-utc -+Include a timestamp column, use UTC timezone. -+.TP -+\-x -+Exclude successful exits, exit( 0 ) -+.TP -+\-p PID -+Trace this process ID only (filtered in-kernel). -+.TP -+\-\-label LABEL -+Label each line with LABEL (default 'exit') in first column (2nd if timestamp is present). -+.SH EXAMPLES -+.TP -+Trace all process termination -+# -+.B aliexitsnoop -+.TP -+Trace all process termination, and include timestamps: -+# -+.B aliexitsnoop \-t -+.TP -+Exclude successful exits, only include non-zero exit codes and fatal signals: -+# -+.B aliexitsnoop \-x -+.TP -+Trace PID 181 only: -+# -+.B aliexitsnoop \-p 181 -+.TP -+Label each output line with 'EXIT': -+# -+.B aliexitsnoop \-\-label EXIT -+.SH FIELDS -+.TP -+TIME-TZ -+Time of process termination HH:MM:SS.sss with milliseconds, where TZ is -+the local time zone, 'UTC' with \-\-utc option. -+.TP -+LABEL -+The optional label if \-\-label option is used. This is useful with the -+\-t option for timestamps when the output of several tracing tools is -+sorted into one combined output. -+.TP -+PCOMM -+Process/command name. -+.TP -+PID -+Process ID -+.TP -+PPID -+The process ID of the process that will be notified of PID termination. -+.TP -+TID -+Thread ID. -+.TP -+EXIT_CODE -+The exit code for exit() or the signal number for a fatal signal. -+.SH OVERHEAD -+This traces the kernel do_exit() function and prints output for each event. -+As the rate of this is generally expected to be low (< 1000/s), the overhead is also -+expected to be negligible. If you have an application that has a high rate of -+process termination, then test and understand overhead before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Alvin Zheng -+.SH SEE ALSO -+aliexecsnoop(8) -diff --git a/man/man8/aliext4writeslower.8 b/man/man8/aliext4writeslower.8 -new file mode 100644 -index 0000000..007a474 ---- /dev/null -+++ b/man/man8/aliext4writeslower.8 -@@ -0,0 +1,70 @@ -+.TH aliext4writeslower 8 "2019-08-30" "USER COMMANDS" -+.SH NAME -+aliext4writeslower \- Trace common ext4 write(2) slower than a threshold -+.SH SYNOPSIS -+.B aliext4writeslower [-h] [-p PID] [-d device] [min_ms] -+.SH DESCRIPTION -+ -+This script traces key steps of write(2) which maybe time-consuming, -+here trace 4 key steps: jbd2 handle start(it may need jbd2 transaction -+checkpoint or commit), page cache allocation, block allocation and -+balance dirty pages, and we also take into account sched delay. -+ -+With this tool, we can clearly know write(2)'s latency proportion. -+ -+This tool uses in-kernel eBPF maps for storing timestamps for efficiency. -+ -+This works by tracing various kernel functions using dynamic tracing, -+and will need updating to match any changes to these tracepoints. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+\-h -+Print usage message. -+.TP -+\-i pid -+trace this PID only -+.TP -+min_ms -+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms. -+.SH EXAMPLES -+.TP -+Trace write(2) slower than 10 ms (default). -+# -+.B aliext4writeslower -+.TP -+Trace write(2) slower than 5 ms. -+# -+.B aliext4writeslower 5 -+.TP -+Trace PID 185 only. -+# -+.B aliext4writeslower -p 185 -+.TP -+Trace /dev/sda5 which contains ext4 fs. -+# -+.B aliext4writeslower -d sda5 -+.TP -+.SH OVERHEAD -+This traces kernel functions and maintains in-kernel timestamps and a histogram, -+which are asynchronously copied to user-space. This method is very efficient, -+and the overhead for most storage I/O rates (< 10k IOPS) should be negligible. -+If you have a higher IOPS storage environment, test and quantify the overhead -+before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Xiaoguang Wang -+.SH SEE ALSO -+biosnoop(8) -diff --git a/man/man8/alihardirqs.8 b/man/man8/alihardirqs.8 -new file mode 100644 -index 0000000..d874e89 ---- /dev/null -+++ b/man/man8/alihardirqs.8 -@@ -0,0 +1,99 @@ -+.TH alihardirqs 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alihardirqs \- Summarize hard IRQ (interrupt) event time. -+.SH SYNOPSIS -+.B alihardirqs [\-h] [\-T] [\-N] [\-D] [\-c CPU] [\-i IRQ] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes hard irq statistics. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-D -+Show distributions as histograms. -+.TP -+\-c -+Show hardIRQ statistics on specific CPU only. -+.TP -+\-i -+Show statistics of specific hardIRQ only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize hardIRQ statistics: -+# -+.B alihardirqs -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alihardirqs 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alihardirqs \-NT 1 -+.TP -+Show hardIRQ statistics as histograms: -+# -+.B alihardirqs \-D -+.TP -+Show hardIRQ statistics of CPU 0 only: -+# -+.B alihardirqs \-c 0 -+.TP -+Show statistics of hardIRQ 25 only: -+# -+.B alihardirqs \-i 25 -+.SH FIELDS -+.TP -+CPU -+CPU index. -+.TP -+HARDIRQ -+hardIRQ number. -+.TP -+IRQHandler -+Name of hardIRQ handler. -+.TP -+Count -+Number of samples on specific CPU or hardIRQ. -+.TP -+Total_ -+Number of time spent on specific hardIRQ. -+.TP -+MAX_ -+Maximum of time spent on specific hardIRQ. -+.TP -+distribution -+An ASCII bar chart to visualize the distribution (Total_ column) -+.SH OVERHEAD -+This sampls at every irq_handler_entry/irq_handler_exit tracepoint, and in-kernel -+summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alijbd2stat.8 b/man/man8/alijbd2stat.8 -new file mode 100644 -index 0000000..4f2de07 ---- /dev/null -+++ b/man/man8/alijbd2stat.8 -@@ -0,0 +1,61 @@ -+.TH alijbd2stat 8 "2019-08-25" "USER COMMANDS" -+.SH NAME -+alijbd2stat \- Summarize filesystem jbd2 stat. -+.SH SYNOPSIS -+.B alijbd2stat [\-h] [\-d device] [\-t thresh] -+.SH DESCRIPTION -+ -+Alijbd2stat can trace every jbd2 handle and every transaction commit info. -+ -+If one jbd2 handle's running time is greater than user specified threshold time, Alijbd2stat -+will catch this handle's context. -+ -+This tool uses in-kernel eBPF maps for storing timestamps and the histogram, -+for efficiency. -+ -+This works by tracing various kernel block tracepoints using dynamic tracing, -+and will need updating to match any changes to these tracepoints. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+\-h -+Print usage message. -+.TP -+\-d device -+Inspect specified device. -+.TP -+\-t thresh -+Show jbd2 handle's context when its run time is greater than this threshold. -+.SH EXAMPLES -+.TP -+Print all jbd2 instances stat. -+# -+.B alijbd2stat -+.TP -+Inspect specified device /dev/sda3, on which some filesystem uses jbd2. -+# -+.B alijbd2stat \-d sda3 -+.TP -+Show jbd2 handle's context when it runs more than 10ms -+# -+.B alijbd2stat \-t 10 -+.TP -+.SH OVERHEAD -+Test and quantify the overhead before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Xiaoguang Wang -+.SH SEE ALSO -+biosnoop(8) -diff --git a/man/man8/alimutexsnoop.8 b/man/man8/alimutexsnoop.8 -new file mode 100644 -index 0000000..f0342b4 ---- /dev/null -+++ b/man/man8/alimutexsnoop.8 -@@ -0,0 +1,71 @@ -+.TH alimutexsnoop 8 "2019-05-28" "USER COMMANDS" -+.SH NAME -+alimutexsnoop \- Trace the mutex that is held by the process longer than a specified time threshold. Uses Linux eBPF/bcc. -+.SH SYNOPSIS -+.B alimutexsnoop [\-h] [\-t THRESHOLD] [\-p PID] -+.SH DESCRIPTION -+alimutexsnoop monitors the mutex that is held by the process longer than -+a specified time threshold, such as 100us. Once alimutexsnoop finds such mutexes, -+it prints the pid of the process that held the mutex and the address of the -+mutex. In addition, alimutexsnoop also prints the user and kernel stack of the -+process. -+ -+This works by tracing the kernel mutex_lock() and mutex_unlock() function, -+and will need updating to match any changes to this function. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-t -+Set the time threshold. The mutexes held longer than this time threshold will be traced(us). -+.TP -+\-p PID -+Trace this process ID only (filtered in-kernel). -+.SH EXAMPLES -+.TP -+Trace all mutexes that are held longer than 1000us (default): -+# -+.B alimutexsnoop -+.TP -+Trace all mutexes that are held longer than 50us: -+# -+.B alimutexsnoop \-t 50 -+.TP -+Trace PID 181 only: -+# -+.B alimutexsnoop \-p 181 -+.SH FIELDS -+.TP -+PCOMM -+Process/command name. -+.TP -+PID -+Process ID -+.TP -+MUTEX_ADDR -+The address of the mutex object. -+.SH OVERHEAD -+This traces the kernel mutex_lock() and mutex_unlock() function and compute the time -+length of the process holds the mutex. The overhead is positively related to the frequency -+of acquiring and releasing mutex. If you have an application that gets and releases mutexes -+frequently, then test and understand overhead before use. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Alvin Zheng -+.SH SEE ALSO -+alimutexsnoop(8) -diff --git a/man/man8/alirunqinfo.8 b/man/man8/alirunqinfo.8 -new file mode 100644 -index 0000000..1ffa3cb ---- /dev/null -+++ b/man/man8/alirunqinfo.8 -@@ -0,0 +1,109 @@ -+.TH alirunqinfo 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alirunqinfo \- Summarize runqueue statistics. -+.SH SYNOPSIS -+.B alirunqinfo [\-h] [\-T] [\-N] [\-D] [\-C] [\-P] [\-c CPU] [\-p PID] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes task on-CPU time before being descheduled. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-D -+Show distributions as histograms. -+.TP -+\-C -+Show per-CPU runqueue statistics. -+.TP -+\-P -+Show per-thread runqueue statistics. -+.TP -+\-c -+Show runqueue statistics on specific CPU only. -+.TP -+\-p -+Show runqueue statistics of specific PID only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize runqueue statistics: -+# -+.B alirunqinfo -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alirunqinfo 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alirunqinfo \-NT 1 -+.TP -+Show per-CPU runqueue statistics: -+# -+.B alirunqinfo \-C -+.TP -+Show per-thread runqueue statistics: -+# -+.B alirunqinfo \-P -+.TP -+Show runqueue statistics as histograms: -+# -+.B alirunqinfo \-D -+.TP -+Show runqueue statistics of CPU 0 only: -+# -+.B alirunqinfo \-c 0 -+.TP -+Show runqueue statistics of PID 25 only: -+# -+.B alirunqinfo \-p 25 -+.SH FIELDS -+.TP -+CPU -+CPU index. -+.TP -+PID -+PID of sampled process. -+.TP -+Thread -+Name of the specified process. -+.TP -+Count -+Number of samples on specific CPU or PID. -+.TP -+Total_ -+Number of task on-CPU time accumulated on specific CPU or PID. -+.TP -+distribution -+An ASCII bar chart to visualize the distribution (Total_ column) -+.SH OVERHEAD -+This sampls at every sched_switch tracepoint, and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alisoftirqs.8 b/man/man8/alisoftirqs.8 -new file mode 100644 -index 0000000..1fc9c50 ---- /dev/null -+++ b/man/man8/alisoftirqs.8 -@@ -0,0 +1,99 @@ -+.TH alisoftirqs 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alisoftirqs \- Summarize soft IRQ (interrupt) event time. -+.SH SYNOPSIS -+.B alisoftirqs [\-h] [\-T] [\-N] [\-D] [\-c CPU] [\-i IRQ] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes soft irq statistics. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-D -+Show distributions as histograms. -+.TP -+\-c -+Show softIRQ statistics on specific CPU only. -+.TP -+\-i -+Show statistics of specific softIRQ only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize softIRQ statistics: -+# -+.B alisoftirqs -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alisoftirqs 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alisoftirqs \-NT 1 -+.TP -+Show softIRQ statistics as histograms: -+# -+.B alisoftirqs \-D -+.TP -+Show softIRQ statistics of CPU 0 only: -+# -+.B alisoftirqs \-c 0 -+.TP -+Show statistics of softIRQ 25 only: -+# -+.B alisoftirqs \-i 25 -+.SH FIELDS -+.TP -+CPU -+CPU index. -+.TP -+SOFTIRQ -+softIRQ number. -+.TP -+IRQHandler -+Name of softIRQ handler. -+.TP -+Count -+Number of samples on specific CPU or softIRQ. -+.TP -+Total_ -+Number of time spent on specific softIRQ. -+.TP -+MAX_ -+Maximum of time spent on specific softIRQ. -+.TP -+distribution -+An ASCII bar chart to visualize the distribution (Total_ column) -+.SH OVERHEAD -+This sampls at every softirq_entry/softirq_exit tracepoint, and in-kernel -+summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alisyscount.8 b/man/man8/alisyscount.8 -new file mode 100644 -index 0000000..284084a ---- /dev/null -+++ b/man/man8/alisyscount.8 -@@ -0,0 +1,112 @@ -+.TH alisyscount 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alisyscount \- Summarize syscall counts and latencies. -+.SH SYNOPSIS -+.B alisyscount [\-h] [\-T] [\-N] [\-D] [\-S] [\-P] [\-s syscall] [\-p PID] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes syscall counts and latencies. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-D -+Show distributions as histograms. -+.TP -+\-S -+Show per-syscall statistics. -+.TP -+\-P -+Show per-thread statistics. -+.TP -+\-s -+Show statistics of specific syscall only. -+.TP -+\-p -+Show syscall statistics of specific PID only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize syscall statistics: -+# -+.B alisyscount -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alisyscount 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alisyscount \-NT 1 -+.TP -+Show syscall statistics as histograms: -+# -+.B alisyscount \-D -+.TP -+Show per-syscall statistics: -+# -+.B alisyscount \-S -+.TP -+Show per-thread syscall statistics: -+# -+.B alisyscount \-P -+.TP -+Show statistics of syscall 25 only: -+# -+.B alisyscount \-s 25 -+.TP -+Show syscall statistics of PID 1213 only: -+# -+.B alisyscount \-p 1213 -+.SH FIELDS -+.TP -+SysNR -+syscall number. -+.TP -+Syscall -+Name of syscall. -+.TP -+PID -+PID of specific process. -+.TP -+Thread -+Name of specific process. -+.TP -+Count -+Number of samples on specific PID or syscall. -+.TP -+Total_ -+Number of time spent on specific syscall. -+.TP -+distribution -+An ASCII bar chart to visualize the distribution (Total_ column) -+.SH OVERHEAD -+This sampls at every sys_enter/sys_exit tracepoint, and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alisysdelay.8 b/man/man8/alisysdelay.8 -new file mode 100644 -index 0000000..a7d91aa ---- /dev/null -+++ b/man/man8/alisysdelay.8 -@@ -0,0 +1,71 @@ -+.TH alisysdelay 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alisysdelay \- Catches tasks hung in kernel mode and get its stack. -+.SH SYNOPSIS -+.B alisysdelay [\-h] [\-d] [\-a] [\-f frequency] [\-t threshold] [\-\-stack\-storage\-size size] -+.SH DESCRIPTION -+This program catches tasks hung in kernel mode and get its stack. -+ -+In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes -+may be unacceptable, because some tasks may run too long in kernel mode -+without calling cond_sched(). These tasks may be kernel thread, or user -+processes that enter kernel mode by syscall. -+ -+alisysdelay can catch tasks that have been running continuously for a long time -+whithout scheduling, and kernel/user stackes of these tasks will be printed. -+The threshold can be user defined. -+ -+By default CPU idle stacks are excluded by simply excluding PID 0. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-d -+Insert delimiter between kernel/user stacks. -+.TP -+\-a -+Add _[k] annotations to kernel frames. -+.TP -+\-f -+Sample frequency in Hertz. -+.TP -+\-t -+Set the threshold of delay in kernel mode, in us. -+.TP -+\-\-stack\-storage\-size -+The number of unique stack traces that can be stored and displayed. -+.SH EXAMPLES -+.TP -+Catches tasks hung in kernel mode and get its stack: -+# -+.B alisysdelay -+.TP -+Catches tasks hung in kernel mode at 99 Hz: -+# -+.B alisysdelay \-f 99 -+.TP -+Catches tasks that hung in kernel mode for more than 100us: -+# -+.B alisysdelay \-t 100 -+.SH FIELDS -+.SH OVERHEAD -+This sampls at every sys_enter/sched_switch tracepoint and _cond_resched kprobe, -+and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/alisyslatency.8 b/man/man8/alisyslatency.8 -new file mode 100644 -index 0000000..854b7a0 ---- /dev/null -+++ b/man/man8/alisyslatency.8 -@@ -0,0 +1,85 @@ -+.TH alisyslatency 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+alisyslatency \- Summarize time spent in kernel context.. -+.SH SYNOPSIS -+.B alisyslatency [\-h] [\-T] [\-N] [\-c CPU] [\-p PID] [interval] [outputs] -+.SH DESCRIPTION -+This program summarizes time spent in kernel context.. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+\-T -+Include timestamps on output. -+.TP -+\-N -+Output in nanosecond. -+.TP -+\-c -+Show statistics on specific CPU only. -+.TP -+\-p -+Show statistics of specific PID only. -+.TP -+interval -+Output interval, in seconds. -+.TP -+outputs -+Number of outputs. -+.SH EXAMPLES -+.TP -+Summarize time in kernel context per CPU: -+# -+.B alisyslatency -+.TP -+Print 1 second summaries, 10 times: -+# -+.B alisyslatency 1 10 -+.TP -+Print 1s summaries, nanoseconds, and timestamps: -+# -+.B alisyslatency \-NT 1 -+.TP -+Show time in kernel context in CPU 0 only: -+# -+.B alisyslatency \-c 0 -+.TP -+Show time in kernel context in pid 25 only: -+# -+.B alisyslatency \-p 25 -+.SH FIELDS -+.TP -+CPU -+CPU index. -+.TP -+PID -+PID of sampled process. -+.TP -+Count -+Number of samples on specific CPU or PID. -+.TP -+Total_ -+Number of time spent on specific PID. -+.TP -+MAX_ -+Maximum of time spent on specific PID. -+.SH OVERHEAD -+This sampls at every user_exit/user_enter tracepoint, and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/man/man8/aliworkslower.8 b/man/man8/aliworkslower.8 -new file mode 100644 -index 0000000..4dac8fb ---- /dev/null -+++ b/man/man8/aliworkslower.8 -@@ -0,0 +1,59 @@ -+.TH aliworkslower 8 "2019-7-25" "USER COMMANDS" -+.SH NAME -+aliworkslower \- Trace high work sched latency. -+.SH SYNOPSIS -+.B aliworkslower [\-h] [threshold] -+.SH DESCRIPTION -+This program traces high work sched latency. -+ -+Since this uses BPF, only the root user can use this tool. -+.SH REQUIREMENTS -+CONFIG_BPF and bcc. -+.SH OPTIONS -+.TP -+\-h -+Print usage message. -+.TP -+[threshold] -+Set threshold of Q2S latency, in microsecond. -+.SH EXAMPLES -+.TP -+Trace work sched latency higher than 10000 us (default): -+# -+.B aliworkslower -+.TP -+Trace work sched latency higher than 1000 us: -+# -+.B aliworkslower 1000 -+.SH FIELDS -+.TP -+Worker Thread -+PID of worker thread. -+.TP -+Work -+Work callback. -+.TP -+Total Q2S latency -+Latency between workqueue_queue_work (Q) and workqueue_execute_start (S), in us. -+.TP -+Q2A latency -+Latency between workqueue_queue_work (Q) and workqueue_execute_start (S), in us. -+.TP -+A2S latency -+Latency between workqueue_activate_work (A) and workqueue_execute_start (S), in us. -+.SH OVERHEAD -+This sampls at every workqueue_queue_work/workqueue_activate_work/ -+workqueue_execute_start tracepoint, and in-kernel summaries. -+.SH SOURCE -+This is from bcc. -+.IP -+https://github.com/iovisor/bcc -+.PP -+Also look in the bcc distribution for a companion _examples.txt file containing -+example usage, output, and commentary for this tool. -+.SH OS -+Linux -+.SH STABILITY -+Unstable - in development. -+.SH AUTHOR -+Jeffle Xu -diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py -index 9222e37..db51793 100755 ---- a/tests/python/test_tools_smoke.py -+++ b/tests/python/test_tools_smoke.py -@@ -421,5 +421,53 @@ class SmokeTests(TestCase): - # Fails to attach the probe if zfs is not installed. - pass - -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alirunqinfo(self): -+ self.run_with_duration("alirunqinfo.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alihardirqs(self): -+ self.run_with_duration("alihardirqs.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alisoftirqs(self): -+ self.run_with_duration("alisoftirqs.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alisyscount(self): -+ self.run_with_duration("alisyscount.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alisysdelay(self): -+ self.run_with_int("alisysdelay.py", timeout=10, kill_timeout=15) -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alisyslatency(self): -+ self.run_with_duration("alisyslatency.py 1 1") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_aliworkslower(self): -+ self.run_with_int("aliworkslower.py", timeout=10, kill_timeout=15) -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_aliexitsnoop(self): -+ self.run_with_int("aliexitsnoop.py ") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alimutexsnoop(self): -+ self.run_with_int("alimutexsnoop.py ") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alibiolatency(self): -+ self.run_with_int("alibiolatency.py ") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alijbd2stat(self): -+ self.run_with_int("alijbd2stat.py ") -+ -+ @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") -+ def test_alijbd2stat(self): -+ self.run_with_int("aliext4writeslower.py ") -+ - if __name__ == "__main__": - main() -diff --git a/tools/alibiolatency.py b/tools/alibiolatency.py -new file mode 100755 -index 0000000..07f98d3 ---- /dev/null -+++ b/tools/alibiolatency.py -@@ -0,0 +1,532 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alibiolatency Summarize block device I/O latency. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alibiolatency [-h] [-i interval] [-d device] [-t avg_req_time] [-T req_time] [-r] -+# -+# Copyright (c) 2019-2021 Alibaba Group. -+# Licensed under the Apache License, Version 2.0 (the "License") -+# -+# 2019/07/01 Xiaoguang Wang Created this. -+ -+from __future__ import print_function -+from bcc import BPF -+import ctypes as ct -+import argparse -+import time -+ -+# arguments -+examples = """examples: -+ ./alibiolatency # summarize block I/O latency(default display interval is 2s) -+ ./alibiolatency -d sda3 # inspect specified device /dev/sda3 -+ ./alibiolatency -i 2 # specify display interval, 2s -+ ./alibiolatency -t 10 # display only when average request process time is greater than 10ms -+ ./alibiolatency -T 20 # dump request life cycle when single request process time is greater than 20ms -+ ./alibiolatency -r # dump every io request life cycle -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize block device I/O latency", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-d", "--device", help="inspect specified device") -+parser.add_argument("-i", "--dis_interval", nargs="?", default=2, -+ help="specify display interval") -+parser.add_argument("-t", "--avg_threshold_time", nargs="?", default=0, -+ help="display only when average request process time is greater than this value") -+parser.add_argument("-T", "--threshold_time", nargs="?", default=999999999, -+ help="dump request life cycle when single request process time is greater than this value") -+parser.add_argument("-r", "--dump_raw", action="store_true", help="dump every io request life cycle") -+parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) -+args = parser.parse_args() -+threshold_time = int(args.threshold_time) -+avg_threshold_time = int(args.avg_threshold_time) -+dis_interval = int(args.dis_interval) -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+#include -+ -+struct req_stat { -+ dev_t dev; -+ u64 queue_us; -+ u64 getrq_us; -+ u64 insert_us; -+ u64 issue_us; -+ u64 completion_us; -+ u64 sector; -+ u32 num_sectors; -+ char rwbs[8]; -+}; -+ -+typedef struct disk_key { -+ dev_t dev; -+ u64 sector; -+} disk_key_t; -+ -+BPF_HASH(io_requests, disk_key_t, struct req_stat); -+BPF_PERF_OUTPUT(req_stat); -+ -+ -+TRACEPOINT_PROBE(block, block_bio_queue) -+{ -+ disk_key_t key = {}; -+ struct req_stat s = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ s.dev = key.dev; -+ s.queue_us = bpf_ktime_get_ns() / 1000; -+ bpf_probe_read(s.rwbs, 8, args->rwbs); -+ io_requests.update(&key, &s); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_getrq) -+{ -+ struct req_stat *r; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ r->getrq_us = bpf_ktime_get_ns() / 1000; -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_bio_backmerge) -+{ -+ struct req_stat *r; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ io_requests.delete(&key); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_bio_frontmerge) -+{ -+ struct req_stat *r; -+ struct req_stat *r2; -+ disk_key_t key = {}; -+ disk_key_t key2 = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector + args->nr_sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ key2.dev = args->dev; -+ key2.sector = args->sector; -+ r2 = io_requests.lookup(&key2); -+ if (r2 == NULL) { -+ io_requests.delete(&key); -+ return 0; -+ } -+ -+ // keep old value. -+ r2->queue_us = r->queue_us; -+ r2->getrq_us = r->getrq_us; -+ r2->insert_us = r->insert_us; -+ r2->issue_us = r->issue_us; -+ io_requests.delete(&key); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_rq_insert) -+{ -+ struct req_stat *r; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ r->insert_us = bpf_ktime_get_ns() / 1000; -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_rq_issue) -+{ -+ struct req_stat *r; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ r->issue_us = bpf_ktime_get_ns() / 1000; -+ r->num_sectors = args->nr_sector; -+ return 0; -+} -+ -+TRACEPOINT_PROBE(block, block_rq_complete) -+{ -+ struct req_stat *r; -+ struct req_stat val = {}; -+ disk_key_t key = {}; -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = args->dev; -+ key.sector = args->sector; -+ r = io_requests.lookup(&key); -+ if (r == NULL) -+ return 0; -+ -+ /* could be happen. -+ if (r->num_sectors != args->nr_sector) -+ bpf_trace_printk("here"); -+ */ -+ r->completion_us = bpf_ktime_get_ns() / 1000; -+ /* FIXME: should we use r? */ -+ //val = *r; -+ val.dev = r->dev; -+ val.queue_us = r->queue_us; -+ val.getrq_us = r->getrq_us; -+ val.insert_us = r->insert_us; -+ val.issue_us = r->issue_us; -+ val.completion_us = r->completion_us; -+ val.sector = args->sector; -+ val.num_sectors = args->nr_sector; -+ memcpy(val.rwbs, r->rwbs, 8); -+ -+ io_requests.delete(&key); -+ -+ req_stat.perf_submit(args, &val, sizeof(struct req_stat)); -+ return 0; -+} -+ -+int trace_elv_merge_requests(struct pt_regs *ctx, struct request_queue *q, -+ struct request *rq, struct request *next) -+{ -+ disk_key_t key = {}; -+ dev_t dev = (next->rq_disk)->part0.__dev.devt; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ key.dev = dev; -+ key.sector = next->__sector; -+ io_requests.delete(&key); -+ -+ return 0; -+} -+ -+""" -+ -+dev_name={} -+ -+def init_dev_name(): -+ global dev_name -+ -+ f = open("/proc/partitions") -+ for line in f.readlines(): -+ line = line.strip() -+ if not len(line): -+ continue; -+ -+ str = line.split() -+ if str[0].isalpha(): -+ continue -+ dev_num = (int(str[0]) << 20) + int(str[1]) -+ dev_name[dev_num] = str[3] -+ f.close() -+ -+init_dev_name() -+name2devid={v:k for k,v in dev_name.items()} -+ -+if args.device: -+ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) -+else: -+ bpf_text = bpf_text.replace('FILTER_DEV', '0') -+ -+if args.dump_raw: -+ dump_raw = 1 -+else: -+ dump_raw = 0 -+ -+# code substitutions -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+# load BPF program -+b = BPF(text=bpf_text) -+b.attach_kprobe(event="elv_merge_requests", fn_name="trace_elv_merge_requests") -+ -+print("Tracing block device I/O... Hit Ctrl-C to end.") -+ -+class Data(ct.Structure): -+ _fields_ = [ -+ ("dev", ct.c_uint), -+ ("queue_us", ct.c_ulonglong), -+ ("getrq_us", ct.c_ulonglong), -+ ("insert_us", ct.c_ulonglong), -+ ("issue_us", ct.c_ulonglong), -+ ("completion_us", ct.c_ulonglong), -+ ("sector", ct.c_ulonglong), -+ ("num_sectors", ct.c_uint), -+ ("rwbs", ct.c_char * 8) -+ ] -+ -+io_stats={} -+last_dis_time=0 -+ -+class req_stats: -+ def __init__(self, dev): -+ self.dev=dev -+ self.sectors=[0, 0, 0] -+ self.num_reqs=[0, 0, 0] -+ self.lat=[0, 0, 0] -+ self.lat_max=[0, 0, 0] -+ self.lat_min=[99999999, 99999999, 99999999] -+ self.i2d_lat=[0, 0, 0] -+ self.i2d_lat_max=[0, 0, 0] -+ self.i2d_lat_min=[99999999, 99999999, 99999999] -+ self.d2c_lat=[0, 0, 0] -+ self.d2c_lat_max=[0,0,0] -+ self.d2c_lat_min=[99999999, 99999999, 99999999] -+ -+ def update_req_stats(self, io_type, num_sectors, lat, i2d_lat, d2c_lat): -+ self.num_reqs[io_type] += 1 -+ self.sectors[io_type] += num_sectors -+ self.lat[io_type] += lat -+ self.i2d_lat[io_type] += i2d_lat -+ self.d2c_lat[io_type] += d2c_lat -+ -+ if lat > self.lat_max[io_type]: -+ self.lat_max[io_type] = lat; -+ -+ if lat < self.lat_min[io_type]: -+ self.lat_min[io_type] = lat; -+ -+ if i2d_lat > self.i2d_lat_max[io_type]: -+ self.i2d_lat_max[io_type] = i2d_lat -+ -+ if i2d_lat < self.i2d_lat_min[io_type]: -+ self.i2d_lat_min[io_type] = i2d_lat -+ -+ if d2c_lat > self.d2c_lat_max[io_type]: -+ self.d2c_lat_max[io_type] = d2c_lat -+ -+ if d2c_lat < self.d2c_lat_min[io_type]: -+ self.d2c_lat_min[io_type] = d2c_lat -+ -+ def display_req_stats(self, time_interval): -+ global avg_threshold_time -+ -+ ti = time_interval / 1000.0 / 1000.0 -+ -+ print("dev: %s" % dev_name[self.dev]) -+ for io_type in (0, 1, 2): -+ if self.num_reqs[io_type] == 0: -+ continue -+ total_reqs = self.num_reqs[io_type] -+ -+ iops = int(total_reqs / ti) -+ avg_lat = self.lat[io_type] / 1000.0 / total_reqs -+ if avg_lat < avg_threshold_time: -+ continue; -+ -+ speed = self.sectors[io_type] * 512 / 1024 / 1024 / ti -+ avg_req_sz = self.sectors[io_type] * 512 / 1024 / total_reqs -+ -+ i2d_avg = self.i2d_lat[io_type] / 1000.0 / total_reqs -+ i2d_percent = int(i2d_avg / avg_lat * 100) -+ -+ d2c_avg = self.d2c_lat[io_type] / 1000.0 / total_reqs -+ d2c_percent = int(d2c_avg / avg_lat * 100) -+ -+ if io_type == 0: -+ io_type_str="READ" -+ elif io_type == 1: -+ io_type_str="WRITE" -+ else: -+ io_type_str="DISCARD" -+ -+ print(" %s iops:%d avg:%.2f min:%.2f max:%.2f speed:%.2fMB/s avgrq-sz:%dKB" % -+ (io_type_str, iops, avg_lat, self.lat_min[io_type] / 1000.0, -+ self.lat_max[io_type] / 1000.0, speed, avg_req_sz)) -+ -+ print(" [i2d_avg: %.2f percent:%d min:%.2f max:%.2f" % (i2d_avg, -+ i2d_percent, self.i2d_lat_min[io_type] / 1000.0, -+ self.i2d_lat_max[io_type] / 1000.0)) -+ -+ print(" [d2c_avg: %.2f percent:%d min:%.2f max:%.2f" % (d2c_avg, -+ d2c_percent, self.d2c_lat_min[io_type] / 1000.0, -+ self.d2c_lat_max[io_type] / 1000.0)) -+ -+ -+def get_io_type(rwbs): -+ # please see logic in kernel func blk_fill_rwbs() -+ if rwbs[0] == 'R' or rwbs[0:2] == "FR": -+ return 0; -+ elif rwbs[0] == 'W' or rwbs[0:2] == "FW": -+ return 1; -+ return 2 -+ -+def is_exception_req(event): -+ # we count this as a abnomal request... -+ # Currently I still hasn't figured out in which case getrq_us -+ # will be zero... -+ if event.getrq_us == 0: -+ print("dev:%s act:%s sector:%lu bytes:%u exception req Q:%lu G:%lu I:%lu D:%lu C:%lu" % ( -+ dev_name[event.dev], event.rwbs, event.sector, event.num_sectors, event.queue_us, -+ event.getrq_us, event.insert_us, event.issue_us, event.completion_us)) -+ -+def dump_raw_trace_event(dev, act, ts, event): -+ print("%s %s %lu sector:%lu bytes:%u act:%s" % (dev, act, ts / 1000, -+ event.sector, event.num_sectors, event.rwbs)) -+ -+def dump_timeout_req(event): -+ dev = dev_name[event.dev] -+ localtime = time.asctime( time.localtime(time.time())) -+ -+ print("%s" % localtime) -+ print("timeouted request sector:%lu bytes:%u req time:%lu" % (event.sector, -+ event.num_sectors, (event.completion_us - event.queue_us) / 1000)) -+ dump_raw_trace_event(dev, "Q", event.queue_us, event) -+ dump_raw_trace_event(dev, "G", event.getrq_us, event) -+ dump_raw_trace_event(dev, "I", event.insert_us, event) -+ dump_raw_trace_event(dev, "D", event.issue_us, event) -+ dump_raw_trace_event(dev, "C", event.completion_us, event) -+ -+def dump_req_life_cycle(event): -+ dev = dev_name[event.dev] -+ localtime = time.asctime( time.localtime(time.time())) -+ -+ print("%s" % localtime) -+ print("raw request event sector:%lu bytes:%u req time:%lu" % (event.sector, -+ event.num_sectors, (event.completion_us - event.queue_us) / 1000)) -+ dump_raw_trace_event(dev, "Q", event.queue_us, event) -+ dump_raw_trace_event(dev, "G", event.getrq_us, event) -+ dump_raw_trace_event(dev, "I", event.insert_us, event) -+ dump_raw_trace_event(dev, "D", event.issue_us, event) -+ dump_raw_trace_event(dev, "C", event.completion_us, event) -+ -+# process event -+def print_event(cpu, data, size): -+ global io_stats -+ global last_dis_time -+ -+ event = ct.cast(data, ct.POINTER(Data)).contents -+ -+ is_exception_req(event) -+ -+ if dump_raw != 0: -+ dump_req_life_cycle(event) -+ -+ lat = event.completion_us - event.queue_us; -+ -+ if event.issue_us != 0: -+ d2c_lat = event.completion_us - event.issue_us -+ elif event.insert_us != 0: -+ d2c_lat = event.completion_us - event.insert_us -+ elif event.getrq_us != 0: -+ d2c_lat = event.completion_us - event.getrq_us -+ else: -+ d2c_lat = lat -+ -+ # Note: this could happen... -+ # In kernel 4.9, for example, scsi_request_fn calls blk_peek_request() -+ # to fetch a request, but if later scsi driver could not process this -+ # request(see not_ready lable in scsi_request_fn), it will call -+ # blk_requeue_request() to put back this request to block elevator, now -+ # insert_us will be greater than issue_us. -+ # -+ # Later scsi_request_fn will pick this request again, but still it could -+ # not handle this request, see codes in scsi_request_fn -+ # if (!scsi_dev_queue_ready(q, sdev)) -+ # break; -+ # then this request will still be in block elevator, but with RQF_STARTED -+ # marked. -+ # -+ # Finally scsi_request_fn calls blk_peek_request() to process this request, -+ # because RQF_STARTED is set, and this time blk_peek_request won't call -+ # trace_block_rq_issue(), then issue_us also won't be updated. -+ # Now insert_us is greater than issue_us. -+ if event.issue_us < event.insert_us: -+ print("lege %u %s sector:%lu bytes:%u %lu %lu" % (dev_name[event.dev], -+ event.rwbs, event.sector, event.num_sectors, event.insert_us, -+ event.issue_us)) -+ -+ if event.insert_us != 0 and event.issue_us > event.insert_us: -+ i2d_lat = event.issue_us - event.insert_us; -+ else: -+ i2d_lat = 0 -+ io_type = get_io_type(event.rwbs) -+ -+ if last_dis_time == 0: -+ last_dis_time = event.completion_us -+ -+ if not io_stats.has_key(event.dev): -+ io_stats[event.dev] = req_stats(event.dev) -+ -+ if lat > threshold_time * 1000: -+ dump_timeout_req(event) -+ -+ io_stats[event.dev].update_req_stats(io_type, event.num_sectors, lat, i2d_lat, d2c_lat) -+ if event.completion_us > last_dis_time and (event.completion_us - last_dis_time) > dis_interval * 1000 * 1000: -+ time_interval = event.completion_us - last_dis_time; -+ localtime = time.asctime( time.localtime(time.time())) -+ -+ keys = io_stats.keys() -+ keys.sort() -+ print("%s" % localtime) -+ for k in keys: -+ io_stats[k].display_req_stats(time_interval) -+ last_dis_time = event.completion_us -+ -+ print("\n") -+ io_stats.clear() -+ -+ -+b["req_stat"].open_perf_buffer(print_event, page_cnt=64) -+while 1: -+ try: -+ b.perf_buffer_poll() -+ -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/alibiolatency_example.txt b/tools/alibiolatency_example.txt -new file mode 100644 -index 0000000..70afc6d ---- /dev/null -+++ b/tools/alibiolatency_example.txt -@@ -0,0 +1,119 @@ -+Demonstrations of alibiolatency. -+ -+Alibiolatency can trace every io request's life cycle and do data filter and statistics -+accordingly. With this tool, we can clearly know io request's different stage's latency -+proportion, for example, if io request D2C(see blktrace) time take majority in total io -+request process time, it's most likely disk bottleneck has been reached. -+ -+Usage: -+alibiolatency [-h] [-d DEVICE] [-i [DIS_INTERVAL]] [-t [AVG_THRESHOLD_TIME]] [-T [THRESHOLD_TIME]] [-r] -+ -+Summarize block device I/O latency -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -d DEVICE, --device DEVICE -+ inspect specified device -+ -i [DIS_INTERVAL], --dis_interval [DIS_INTERVAL] -+ specify display interval -+ -t [AVG_THRESHOLD_TIME], --avg_threshold_time [AVG_THRESHOLD_TIME] -+ display only when average request process time is -+ greater than this value -+ -T [THRESHOLD_TIME], --threshold_time [THRESHOLD_TIME] -+ dump request life cycle when single request process -+ time is greater than this value -+ -r, --dump_raw dump every io request life cycle -+ -+examples: -+ ./alibiolatency # summarize block I/O latency(default display interval is 2s) -+ ./alibiolatency -d sda3 # inspect specified device /dev/sda3 -+ ./alibiolatency -i 2 # specify display interval, 2s -+ ./alibiolatency -t 10 # display only when average request process time is greater than 10ms -+ ./alibiolatency -T 20 # dump request life cycle when single request process time is greater than 20ms -+ ./alibiolatency -T 20 # dump request life cycle when single request process time is greater than 20ms -+ ./alibiolatency -r # dump every io request life cycle -+ -+ -+Output example: -+ -+Example 1: -+# Summarize block device I/O latency periodically. -+python alibiolatency.py -+ -+Wed Aug 28 19:13:00 2019 -+dev: sda -+ READ iops:32 avg:15.36 min:0.02 max:143.51 speed:0.00MB/s avgrq-sz:4KB -+ [i2d_avg: 0.00 percent:0 min:0.00 max:0.01 -+ [d2c_avg: 15.26 percent:99 min:0.02 max:143.51 -+ WRITE iops:2 avg:0.06 min:0.04 max:0.09 speed:0.00MB/s avgrq-sz:24KB -+ [i2d_avg: 0.00 percent:2 min:0.00 max:0.00 -+ [d2c_avg: 0.05 percent:76 min:0.03 max:0.06 -+ -+ -+Wed Aug 28 19:13:03 2019 -+dev: sda -+ READ iops:62 avg:2.03 min:0.03 max:16.08 speed:0.68MB/s avgrq-sz:14KB -+ [i2d_avg: 0.00 percent:0 min:0.00 max:0.00 -+ [d2c_avg: 1.99 percent:97 min:0.03 max:16.07 -+ WRITE iops:6 avg:0.06 min:0.03 max:0.12 speed:0.00MB/s avgrq-sz:10KB -+ [i2d_avg: 0.00 percent:5 min:0.00 max:0.01 -+ [d2c_avg: 0.04 percent:76 min:0.03 max:0.07 -+ -+Example 2: -+# Summarize block device I/O latency periodically, and will print io request's -+# complete life cycle when this io request's process time is greater than 20ms. -+python alibiolatency.py -T 20 # Summarize block device I/O latency periodically. -+ -+timeouted request sector:269555712 bytes:1024 req time:379 -+sda Q 3102021522 sector:269555712 bytes:1024 act:W -+sda G 3102021522 sector:269555712 bytes:1024 act:W -+sda I 3102021522 sector:269555712 bytes:1024 act:W -+sda D 3102021522 sector:269555712 bytes:1024 act:W -+sda C 3102021902 sector:269555712 bytes:1024 act:W -+Wed Aug 28 19:28:44 2019 -+timeouted request sector:269556736 bytes:1024 req time:381 -+sda Q 3102021522 sector:269556736 bytes:1024 act:W -+sda G 3102021522 sector:269556736 bytes:1024 act:W -+sda I 3102021522 sector:269556736 bytes:1024 act:W -+sda D 3102021522 sector:269556736 bytes:1024 act:W -+sda C 3102021904 sector:269556736 bytes:1024 act:W -+Wed Aug 28 19:28:45 2019 -+dev: sda -+ READ iops:0 avg:0.07 min:0.07 max:0.07 speed:0.00MB/s avgrq-sz:4KB -+ [i2d_avg: 0.01 percent:17 min:0.01 max:0.01 -+ [d2c_avg: 0.05 percent:73 min:0.05 max:0.05 -+ WRITE iops:173 avg:339.79 min:0.04 max:501.72 speed:69.09MB/s avgrq-sz:408KB -+ [i2d_avg: 0.02 percent:0 min:0.00 max:0.06 -+ [d2c_avg: 339.16 percent:99 min:0.04 max:501.67 -+ -+Example 3: -+# Summarize block device I/O latency periodically, and will print every io -+# request's complete life cycle. -+python alibiolatency.py -r -+ -+Wed Aug 28 19:36:40 2019 -+raw request event sector:52903080 bytes:8 req time:0 -+sda Q 3102498371 sector:52903080 bytes:8 act:WS -+sda G 3102498371 sector:52903080 bytes:8 act:WS -+sda I 3102498371 sector:52903080 bytes:8 act:WS -+sda D 3102498371 sector:52903080 bytes:8 act:WS -+sda C 3102498371 sector:52903080 bytes:8 act:WS -+Wed Aug 28 19:36:40 2019 -+raw request event sector:2157826760 bytes:8 req time:0 -+sda Q 3102498785 sector:2157826760 bytes:8 act:W -+sda G 3102498785 sector:2157826760 bytes:8 act:W -+sda I 3102498785 sector:2157826760 bytes:8 act:W -+sda D 3102498785 sector:2157826760 bytes:8 act:W -+sda C 3102498785 sector:2157826760 bytes:8 act:W -+Wed Aug 28 19:36:42 2019 -+raw request event sector:208390056 bytes:8 req time:0 -+sda Q 3102500593 sector:208390056 bytes:8 act:W -+sda G 3102500593 sector:208390056 bytes:8 act:W -+sda I 3102500593 sector:208390056 bytes:8 act:W -+sda D 3102500593 sector:208390056 bytes:8 act:W -+sda C 3102500593 sector:208390056 bytes:8 act:W -+Wed Aug 28 19:36:42 2019 -+dev: sda -+ WRITE iops:42 avg:0.13 min:0.04 max:0.24 speed:0.00MB/s avgrq-sz:5KB -+ [i2d_avg: 0.01 percent:8 min:0.00 max:0.03 -+ [d2c_avg: 0.06 percent:48 min:0.03 max:0.10 -diff --git a/tools/aliexitsnoop.py b/tools/aliexitsnoop.py -new file mode 100755 -index 0000000..4f0b1a1 ---- /dev/null -+++ b/tools/aliexitsnoop.py -@@ -0,0 +1,302 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+from __future__ import print_function -+ -+import argparse -+import ctypes as ct -+import os -+import platform -+import re -+import signal -+import sys -+ -+from bcc import BPF -+from bcc.utils import printb -+from datetime import datetime -+from time import strftime -+ -+# -+# aliexitsnoop Trace all process termination (exit, fatal signal) -+# For Linux, uses BCC, eBPF. Embedded C. -+# -+# USAGE: aliexitsnoop [-h] [-x] [-t] [--utc] [--label[=LABEL]] [-p PID] -+# -+_examples = """examples: -+ aliexitsnoop # trace all process termination -+ aliexitsnoop -x # trace only fails, exclude exit(0) -+ aliexitsnoop -t # include timestamps (local time) -+ aliexitsnoop --utc # include timestamps (UTC) -+ aliexitsnoop -p 181 # only trace PID 181 -+ aliexitsnoop --label=exit # label each output line with 'exit' -+""" -+""" -+ Exit status (from ): -+ -+ 0 EX_OK Success -+ 2 argparse error -+ 70 EX_SOFTWARE syntax error detected by compiler, or -+ verifier error from kernel -+ 77 EX_NOPERM Need sudo (CAP_SYS_ADMIN) for BPF() system call -+ -+ More information about this script is in bcc/tools/aliexitsnoop_example.txt -+ -+ Copyright 2016 Netflix, Inc. -+ Copyright 2019 Instana, Inc. -+ Copyright 2019 Alibaba, Inc. -+ Licensed under the Apache License, Version 2.0 (the "License") -+ -+ 07-Feb-2016 Brendan Gregg (Netflix) Created execsnoop -+ 04-May-2019 Arturo Martin-de-Nicolas (Instana) Created exitsnoop -+ 13-May-2019 Jeroen Soeters (Instana) Refactor to import as module -+ 16-Jul-2019 Alvin Zheng (Alibaba) Created aliexitsnoop -+""" -+ -+def _getParser(): -+ parser = argparse.ArgumentParser( -+ description="Trace all process termination (exit, fatal signal)", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=_examples) -+ a = parser.add_argument -+ a("-t", "--timestamp", action="store_true", help="include timestamp (local time default)") -+ a("--utc", action="store_true", help="include timestamp in UTC (-t implied)") -+ a("-p", "--pid", help="trace this PID only") -+ a("--label", help="label each line") -+ a("-x", "--failed", action="store_true", help="trace only fails, exclude exit(0)") -+ # print the embedded C program and exit, for debugging -+ a("--ebpf", action="store_true", help=argparse.SUPPRESS) -+ # RHEL 7.6 keeps task->start_time as struct timespec, convert to u64 nanoseconds -+ a("--timespec", action="store_true", help=argparse.SUPPRESS) -+ return parser.parse_args -+ -+ -+class Global(): -+ parse_args = _getParser() -+ args = None -+ argv = None -+ bpf = None -+ SIGNUM_TO_SIGNAME = dict((v, re.sub("^SIG", "", k)) -+ for k, v in signal.__dict__.items() if re.match("^SIG[A-Z]+$", k)) -+ -+ -+class Data(ct.Structure): -+ """Event data matching struct data_t in _embedded_c().""" -+ _TASK_COMM_LEN = 16 # linux/sched.h -+ _pack_ = 1 -+ _fields_ = [ -+ ("start_time", ct.c_ulonglong), # task->start_time, see --timespec arg -+ ("exit_time", ct.c_ulonglong), # bpf_ktime_get_ns() -+ ("pid_tgid", ct.c_ulonglong), -+ ("pid", ct.c_uint), # task->tgid, thread group id == sys_getpid() -+ ("tid", ct.c_uint), # task->pid, thread id == sys_gettid() -+ ("ppid", ct.c_uint), # task->parent->tgid, notified of exit -+ ("exit_code", ct.c_int), -+ ("sig_info", ct.c_uint), -+ ("k_stack_id", ct.c_int), -+ ("u_stack_id", ct.c_int), -+ ("task", ct.c_char * _TASK_COMM_LEN) -+ ] -+ -+def _embedded_c(args): -+ """Generate C program for sched_process_exit tracepoint in kernel/exit.c.""" -+ c = """ -+ EBPF_COMMENT -+ #include -+ BPF_STATIC_ASSERT_DEF -+ BPF_STACK_TRACE(stack_traces,16384); -+ struct data_t { -+ u64 start_time; -+ u64 exit_time; -+ u64 pid_tgid; -+ u32 pid; -+ u32 tid; -+ u32 ppid; -+ int exit_code; -+ u32 sig_info; -+ int k_stack_id; -+ int u_stack_id; -+ char task[TASK_COMM_LEN]; -+ } __attribute__((packed)); -+ -+ BPF_STATIC_ASSERT(sizeof(struct data_t) == CTYPES_SIZEOF_DATA); -+ BPF_PERF_OUTPUT(events); -+ -+ int kprobe_do_exit(struct pt_regs * ctx, long exit_code) -+ { -+ struct task_struct *task = (typeof(task))bpf_get_current_task(); -+ if (FILTER_PID || FILTER_EXIT_CODE) { return 0; } -+ -+ struct data_t data = { -+ .start_time = PROCESS_START_TIME_NS, -+ .exit_time = bpf_ktime_get_ns(), -+ .pid_tgid=bpf_get_current_pid_tgid(), -+ .pid = task->tgid, -+ .tid = task->pid, -+ .ppid = task->parent->tgid, -+ .exit_code = task->exit_code >> 8, -+ .sig_info = task->exit_code & 0xFF, -+ }; -+ bpf_get_current_comm(&data.task, sizeof(data.task)); -+ data.k_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID); -+ data.u_stack_id=stack_traces.get_stackid(ctx,BPF_F_USER_STACK|BPF_F_REUSE_STACKID); -+ events.perf_submit(ctx, &data, sizeof(data)); -+ return 0; -+ } -+ """ -+ # TODO: this macro belongs in bcc/src/cc/export/helpers.h -+ bpf_static_assert_def = r""" -+ #ifndef BPF_STATIC_ASSERT -+ #define BPF_STATIC_ASSERT(condition) __attribute__((unused)) \ -+ extern int bpf_static_assert[(condition) ? 1 : -1] -+ #endif -+ """ -+ code_substitutions = [ -+ ('EBPF_COMMENT', '' if not Global.args.ebpf else _ebpf_comment()), -+ ("BPF_STATIC_ASSERT_DEF", bpf_static_assert_def), -+ ("CTYPES_SIZEOF_DATA", str(ct.sizeof(Data))), -+ ('FILTER_PID', '0' if not Global.args.pid else "task->tgid != %s" % Global.args.pid), -+ ('FILTER_EXIT_CODE', '0' if not Global.args.failed else 'task->exit_code == 0'), -+ ('PROCESS_START_TIME_NS', 'task->start_time' if not Global.args.timespec else -+ '(task->start_time.tv_sec * 1000000000L) + task->start_time.tv_nsec'), -+ ] -+ for old, new in code_substitutions: -+ c = c.replace(old, new) -+ return c -+ -+def _ebpf_comment(): -+ """Return a C-style comment with information about the generated code.""" -+ comment = ('Created by %s at %s:\n\t%s' % -+ (sys.argv[0], strftime("%Y-%m-%d %H:%M:%S %Z"), _embedded_c.__doc__)) -+ args = str(vars(Global.args)).replace('{', '{\n\t').replace(', ', ',\n\t').replace('}', ',\n }\n\n') -+ return ("\n /*" + ("\n %s\n\n ARGV = %s\n\n ARGS = %s/" % -+ (comment, ' '.join(Global.argv), args)) -+ .replace('\n', '\n\t*').replace('\t', ' ')) -+ -+def _print_header(): -+ if Global.args.timestamp: -+ title = 'TIME-' + ('UTC' if Global.args.utc else strftime("%Z")) -+ print("%-13s" % title, end="") -+ if Global.args.label is not None: -+ print("%-6s" % "LABEL", end="") -+ print("%-16s %-6s %-6s %-6s %-7s %-10s" % -+ ("PCOMM", "PID", "PPID", "TID", "AGE(s)", "EXIT_CODE")) -+ -+def _print_event(cpu, data, size): # callback -+ """Print the exit event.""" -+ stack_traces = Global.bpf['stack_traces'] -+ e = ct.cast(data, ct.POINTER(Data)).contents -+ if Global.args.timestamp: -+ now = datetime.utcnow() if Global.args.utc else datetime.now() -+ print("%-13s" % (now.strftime("%H:%M:%S.%f")[:-3]), end="") -+ if Global.args.label is not None: -+ label = Global.args.label if len(Global.args.label) else 'exit' -+ print("%-6s" % label, end="") -+ age = (e.exit_time - e.start_time) / 1e9 -+ print("%-16s %-6d %-6d %-6d %-7.2f " % -+ (e.task.decode(), e.pid, e.ppid, e.tid, age), end="") -+ if e.sig_info == 0: -+ print("0" if e.exit_code == 0 else "code %d" % e.exit_code) -+ else: -+ sig = e.sig_info & 0x7F -+ if sig: -+ print("signal %d (%s)" % (sig, signum_to_signame(sig)), end="") -+ if e.sig_info & 0x80: -+ print(", core dumped ", end="") -+ print() -+ # print the kernel stack -+ if e.k_stack_id >= 0: -+ cur_stack = reversed(list(stack_traces.walk(e.k_stack_id))) -+ for addr in cur_stack: -+ printb(b"Kernel Stack: %-16x %s" % (addr, Global.bpf.ksym(addr))) -+ else: -+ print("No KERNEL STACK FOUND") -+ # print the user stack -+ if e.u_stack_id >= 0: -+ cur_stack = reversed(list(stack_traces.walk(e.u_stack_id))) -+ for addr in cur_stack: -+ printb(b"USER Stack: %-16x %s" % (addr, Global.bpf.sym(addr, e.pid_tgid))) -+ else: -+ print("No USER STACK FOUND") -+ print("=================================") -+ print() -+# ============================= -+# Module: These functions are available for import -+# ============================= -+def initialize(arg_list=sys.argv[1:]): -+ """Trace all process termination. -+ -+ arg_list - list of args, if omitted then uses command line args -+ arg_list is passed to argparse.ArgumentParser.parse_args() -+ -+ For example, if arg_list = [ '-x', '-t' ] -+ args.failed == True -+ args.timestamp == True -+ -+ Returns a tuple (return_code, result) -+ 0 = Ok, result is the return value from BPF() -+ 1 = args.ebpf is requested, result is the generated C code -+ os.EX_NOPERM: need CAP_SYS_ADMIN, result is error message -+ os.EX_SOFTWARE: internal software error, result is error message -+ """ -+ Global.argv = arg_list -+ Global.args = Global.parse_args(arg_list) -+ if Global.args.utc and not Global.args.timestamp: -+ Global.args.timestamp = True -+ if not Global.args.ebpf and os.geteuid() != 0: -+ return (os.EX_NOPERM, "Need sudo (CAP_SYS_ADMIN) for BPF() system call") -+ if re.match('^3\.10\..*el7.*$', platform.release()): # Centos/Red Hat -+ Global.args.timespec = True -+ for _ in range(2): -+ c = _embedded_c(Global.args) -+ if Global.args.ebpf: -+ return (1, c) -+ try: -+ return (os.EX_OK, BPF(text=c)) -+ except Exception as e: -+ error = format(e) -+ if (not Global.args.timespec and error.find('struct timespec') and error.find('start_time')): -+ print('This kernel keeps task->start_time in a struct timespec.\n' + -+ 'Retrying with --timespec') -+ Global.args.timespec = True -+ continue -+ return (os.EX_SOFTWARE, "BPF error: " + error) -+ except: -+ return (os.EX_SOFTWARE, "Unexpected error: {0}".format(sys.exc_info()[0])) -+ -+def snoop(bpf, event_handler): -+ """Call event_handler for process termination events. -+ -+ bpf - result returned by successful initialize() -+ event_handler - callback function to handle termination event -+ args.pid - Return after event_handler is called, only monitoring this pid -+ """ -+ bpf["events"].open_perf_buffer(event_handler) -+ while True: -+ bpf.perf_buffer_poll() -+ if Global.args.pid: -+ return -+ -+def signum_to_signame(signum): -+ """Return the name of the signal corresponding to signum.""" -+ return Global.SIGNUM_TO_SIGNAME.get(signum, "unknown") -+ -+# ============================= -+# Script: invoked as a script -+# ============================= -+def main(): -+ try: -+ rc, buffer = initialize() -+ Global.bpf = buffer -+ Global.bpf.attach_kprobe(event="do_exit", fn_name="kprobe_do_exit") -+ if rc: -+ print(buffer) -+ sys.exit(0 if Global.args.ebpf else rc) -+ _print_header() -+ snoop(buffer, _print_event) -+ except KeyboardInterrupt: -+ print() -+ sys.exit() -+ -+ return 0 -+ -+if __name__ == '__main__': -+ main() -diff --git a/tools/aliexitsnoop_example.txt b/tools/aliexitsnoop_example.txt -new file mode 100644 -index 0000000..4b60929 ---- /dev/null -+++ b/tools/aliexitsnoop_example.txt -@@ -0,0 +1,78 @@ -+Demonstrations of aliexitsnoop. -+ -+This Linux tool is actually an enhanced version of the exitsnoop. For more details about -+exitsnoop, see exitsnoop_example.txt. Compared to the exitsnoop, aliexitsnoop not only print -+the exit code, pid, ppid, lifetime of the exiting processes, but also the stack of the exiting -+processes. The demand stems from the fact that in the production environment, some processes -+end up unexpectedly. Therefore we need to dump the stack of the target processes to find out -+what caused the target process to exit. -+ -+Usage: -+aliexitsnoop.py [-h] [-t] [--utc] [-p PID] [--label LABEL] [-x] -+ -+Trace all process termination (exit, fatal signal) -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -t, --timestamp include timestamp (local time default) -+ --utc include timestamp in UTC (-t implied) -+ -p PID, --pid PID trace this PID only -+ --label LABEL label each line -+ -x, --failed trace only fails, exclude exit(0) -+ -+examples: -+ aliexitsnoop # trace all process termination -+ aliexitsnoop -x # trace only fails, exclude exit(0) -+ aliexitsnoop -t # include timestamps (local time) -+ aliexitsnoop --utc # include timestamps (UTC) -+ aliexitsnoop -p 181 # only trace PID 181 -+ aliexitsnoop --label=exit # label each output line with 'exit' -+ -+The following example shows the output example when run in a loop that is interrupted with -+Ctrl-C from the terminal: -+ -+grep 25099 25087 25099 0.00 0 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group -+Kernel Stack: ffffffffa5083ad9 do_group_exit -+Kernel Stack: ffffffffa5082e11 do_exit -+USER Stack: c74864108b48 [unknown] -+USER Stack: 7fd29b10c660 [unknown] -+USER Stack: 7fd29b0611d9 [unknown] -+================================= -+ -+id 25101 25100 25101 0.00 0 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group -+Kernel Stack: ffffffffa5083ad9 do_group_exit -+Kernel Stack: ffffffffa5082e11 do_exit -+USER Stack: c74864108b48 [unknown] -+USER Stack: 7f932c07b660 [unknown] -+USER Stack: 7f932bfd01d9 [unknown] -+================================= -+ -+bash 25100 25087 25100 0.00 0 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa5083b54 __x64_sys_exit_group -+Kernel Stack: ffffffffa5083ad9 do_group_exit -+Kernel Stack: ffffffffa5082e11 do_exit -+USER Stack: c74864108b48 [unknown] -+USER Stack: 7f5318862660 [unknown] -+USER Stack: 7f53187b71d9 [unknown] -+================================= -+ -+systemd 1 0 25102 0.00 0 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa5083a97 __x64_sys_exit -+Kernel Stack: ffffffffa5082e11 do_exit -+USER Stack: 7f92232c1e64 [unknown] -+ -+We can see that aliexitsnoop prints the user stack and kernel stack of the exiting processes. -+What needs to be noted is that aliexitsnoop can only get the address track of the user stack. -+The reason why aliexitsnoop cannot get the symbols corresponding the addresses is that the -+user space of the process does not exists after the process ends. However, the address track -+of the user stack still helps, because you can analyze together with /proc/$pid/maps. -diff --git a/tools/aliext4writeslower.py b/tools/aliext4writeslower.py -new file mode 100755 -index 0000000..db626c7 ---- /dev/null -+++ b/tools/aliext4writeslower.py -@@ -0,0 +1,411 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# aliext4writeslower Trace slow ext4 write operation. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: aliext4writeslower [-h] [-p PID] [-d device] [min_ms] -+# -+# This script traces key steps of write(2) which maybe time-consuming, -+# here trace 4 key steps: jbd2 handle start(it may need jbd2 trancation -+# checkpoint or commit), page cache allocation, block allocation and -+# balance dirty pages, and we also take into account sched delay. -+# -+# With this tool, we can clearly know write(2)'s latency proportion. -+# -+# WARNING: This adds low-overhead instrumentation to these ext4 operations, -+# including reads and writes from the file system cache. Such reads and writes -+# can be very frequent (depending on the workload; eg, 1M/sec), at which -+# point the overhead of this tool (even if it prints no "slower" events) can -+# begin to become significant. -+# -+# By default, a minimum millisecond threshold of 10 is used. -+# -+# Licensed under the Apache License, Version 2.0 (the "License") -+# -+# 2019-08-29 Xiaoguang Wang Created this. -+ -+from __future__ import print_function -+from bcc import BPF -+import argparse -+from time import strftime -+import ctypes as ct -+ -+# symbols -+kallsyms = "/proc/kallsyms" -+ -+# arguments -+examples = """examples: -+ ./aliext4writeslower # trace write(2) slower than 10 ms (default) -+ ./aliext4writeslower 1 # trace write(2) slower than 1 ms -+ ./aliext4writeslower 0 # trace all operations (warning: verbose) -+ ./aliext4writeslower -d sda5 # inspect specified device /dev/sda5 containing ext4 fs -+ ./aliext4writeslower -p 185 # trace PID 185 only -+""" -+parser = argparse.ArgumentParser( -+ description="Trace common ext4 write(2) slower than a threshold", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-p", "--pid", help="trace this PID only") -+parser.add_argument("-d", "--device", help="inspect specified device") -+parser.add_argument("min_ms", nargs="?", default='10', -+ help="minimum I/O duration to trace, in ms (default 10)") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+args = parser.parse_args() -+min_ms = int(args.min_ms) -+pid = args.pid -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+#include -+#include -+ -+enum { -+ TRACE_PAGECACHE_ALLOC = 0, -+ TRACE_JOURNAL_START, -+ TRACE_BLOCK_ALLOC, -+ TRACE_BALANCE_DIRTY, -+ TRACE_WRITE_BEGIN, -+ TRACE_WRITE_END, -+ TRACE_TYPE_MAX, -+}; -+ -+struct val_t { -+ dev_t dev; -+ u64 ts; -+ u64 sched_delay; -+ u64 trace_ts[TRACE_TYPE_MAX]; -+ u64 trace_time[TRACE_TYPE_MAX]; -+ u64 offset; -+ struct file *fp; -+}; -+ -+struct data_t { -+ dev_t dev; -+ u64 size; -+ u64 offset; -+ u64 delta_us; -+ u64 pagecache_alloc_us; -+ u64 journal_start_us; -+ u64 block_alloc_us; -+ u64 balance_dirty_us; -+ u64 write_begin_us; -+ u64 write_end_us; -+ u64 sched_delay; -+ u64 pid; -+ char task[TASK_COMM_LEN]; -+ char file[DNAME_INLINE_LEN]; -+}; -+ -+BPF_HASH(entryinfo, u64, struct val_t); -+BPF_PERF_OUTPUT(events); -+ -+ -+// ext4_file_write_iter(): -+int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb) -+{ -+ struct val_t val = {}; -+ u64 id = bpf_get_current_pid_tgid(); -+ u32 pid = id >> 32; // PID is higher part -+ struct task_struct *p = (struct task_struct *)bpf_get_current_task(); -+ struct inode *inode = iocb->ki_filp->f_inode; -+ dev_t dev = inode->i_sb->s_bdev->bd_dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ if (FILTER_PID) -+ return 0; -+ -+ val.ts = bpf_ktime_get_ns(); -+ val.sched_delay = p->sched_info.run_delay; -+ val.fp = iocb->ki_filp; -+ val.dev = dev; -+ val.offset = iocb->ki_pos; -+ if (val.fp) -+ entryinfo.update(&id, &val); -+ -+ return 0; -+} -+ -+int trace_write_return(struct pt_regs *ctx) -+{ -+ struct val_t *valp; -+ u64 ts, delta_us, id = bpf_get_current_pid_tgid(); -+ u32 size, pid = id >> 32; -+ struct task_struct *p = (struct task_struct *)bpf_get_current_task(); -+ struct data_t data = {}; -+ -+ if (FILTER_PID) -+ return 0; -+ -+ valp = entryinfo.lookup(&id); -+ if (valp == 0) -+ return 0; -+ -+ ts = bpf_ktime_get_ns(); -+ delta_us = (ts - valp->ts) / 1000; -+ entryinfo.delete(&id); -+ if (FILTER_US) -+ return 0; -+ -+ size = PT_REGS_RC(ctx); -+ data.size = size; -+ data.delta_us = delta_us; -+ data.pid = pid; -+ data.sched_delay = (p->sched_info.run_delay - valp->sched_delay) / 1000; -+ data.offset = valp->offset; -+ data.dev = valp->dev; -+ data.pagecache_alloc_us = valp->trace_time[TRACE_PAGECACHE_ALLOC] / 1000; -+ data.journal_start_us = valp->trace_time[TRACE_JOURNAL_START] / 1000; -+ data.block_alloc_us = valp->trace_time[TRACE_BLOCK_ALLOC] / 1000; -+ data.balance_dirty_us = valp->trace_time[TRACE_BALANCE_DIRTY] / 1000; -+ data.write_begin_us = valp->trace_time[TRACE_WRITE_BEGIN] / 1000; -+ data.write_end_us = valp->trace_time[TRACE_WRITE_END] / 1000; -+ bpf_get_current_comm(&data.task, sizeof(data.task)); -+ -+ // workaround (rewriter should handle file to d_name in one step): -+ struct dentry *de = NULL; -+ struct qstr qs = {}; -+ bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry); -+ bpf_probe_read(&qs, sizeof(qs), (void *)&de->d_name); -+ if (qs.len == 0) -+ return 0; -+ bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name); -+ -+ // output -+ events.perf_submit(ctx, &data, sizeof(data)); -+ -+ return 0; -+} -+ -+ -+static int trace_type_entry(struct pt_regs *ctx, int type) -+{ -+ struct val_t *valp; -+ u64 id = bpf_get_current_pid_tgid(); -+ u32 pid = id >> 32; // PID is higher part -+ u64 ts = bpf_ktime_get_ns(); -+ -+ valp = entryinfo.lookup(&id); -+ if (valp == 0) { -+ // missed tracing issue or filtered -+ return 0; -+ } -+ -+ valp->trace_ts[type] = ts; -+ return 0; -+} -+ -+static int trace_type_return(struct pt_regs *ctx, int type) -+{ -+ struct val_t *valp; -+ u64 id = bpf_get_current_pid_tgid(); -+ u32 pid = id >> 32; // PID is higher part -+ u64 ts = bpf_ktime_get_ns(); -+ -+ valp = entryinfo.lookup(&id); -+ if (valp == 0) { -+ // missed tracing issue or filtered -+ return 0; -+ } -+ -+ valp->trace_time[type] += ts - valp->trace_ts[type];; -+ return 0; -+} -+ -+int trace_pagecache_alloc_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_PAGECACHE_ALLOC); -+} -+ -+int trace_pagecache_alloc_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_PAGECACHE_ALLOC); -+} -+ -+int trace_journal_start_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_JOURNAL_START); -+} -+ -+int trace_journal_start_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_JOURNAL_START); -+} -+ -+int trace_block_alloc_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_BLOCK_ALLOC); -+} -+ -+int trace_block_alloc_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_BLOCK_ALLOC); -+} -+ -+int trace_balance_dirty_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_BALANCE_DIRTY); -+} -+ -+int trace_balance_dirty_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_BALANCE_DIRTY); -+} -+ -+int trace_aops_write_begin_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_WRITE_BEGIN); -+} -+ -+int trace_aops_write_begin_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_WRITE_BEGIN); -+} -+ -+int trace_aops_write_end_entry(struct pt_regs *ctx) -+{ -+ return trace_type_entry(ctx, TRACE_WRITE_END); -+} -+ -+int trace_aops_write_end_return(struct pt_regs *ctx) -+{ -+ return trace_type_return(ctx, TRACE_WRITE_END); -+} -+""" -+ -+devid2name={} -+num_trans=0 -+ -+def init_dev_name(): -+ global devid2name -+ -+ f = open("/proc/partitions") -+ for line in f.readlines(): -+ line = line.strip() -+ if not len(line): -+ continue; -+ -+ str = line.split() -+ if str[0].isalpha(): -+ continue -+ dev_num = (int(str[0]) << 20) + int(str[1]) -+ devid2name[dev_num] = str[3] -+ f.close() -+ -+init_dev_name() -+ -+name2devid={v:k for k,v in devid2name.items()} -+ -+if args.device: -+ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) -+else: -+ bpf_text = bpf_text.replace('FILTER_DEV', '0') -+ -+if min_ms == 0: -+ bpf_text = bpf_text.replace('FILTER_US', '0') -+else: -+ bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_ms * 1000)) -+ -+if args.pid: -+ bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) -+else: -+ bpf_text = bpf_text.replace('FILTER_PID', '0') -+ -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+# kernel->user event data: struct data_t -+DNAME_INLINE_LEN = 32 # linux/dcache.h -+TASK_COMM_LEN = 16 # linux/sched.h -+class Data(ct.Structure): -+ _fields_ = [ -+ ("dev", ct.c_uint), -+ ("size", ct.c_ulonglong), -+ ("offset", ct.c_ulonglong), -+ ("delta_us", ct.c_ulonglong), -+ ("pagecache_alloc_us", ct.c_ulonglong), -+ ("journal_start_us", ct.c_ulonglong), -+ ("block_alloc_us", ct.c_ulonglong), -+ ("balance_dirty_us", ct.c_ulonglong), -+ ("write_begin_us", ct.c_ulonglong), -+ ("write_end_us", ct.c_ulonglong), -+ ("sched_delay", ct.c_ulonglong), -+ ("pid", ct.c_ulonglong), -+ ("task", ct.c_char * TASK_COMM_LEN), -+ ("file", ct.c_char * DNAME_INLINE_LEN) -+ ] -+ -+# process event -+def print_event(cpu, data, size): -+ event = ct.cast(data, ct.POINTER(Data)).contents -+ -+ print("%-8s %-8s %-14.14s %-6s %-7s %-8d %7.2f %8.2f %10.2f %11.2f %13.2f %11.2f %9.2f %11.2f %s" % -+ (strftime("%H:%M:%S"), devid2name[event.dev], -+ event.task.decode(), event.pid, event.size, event.offset / 1024, -+ float(event.delta_us) / 1000, float(event.pagecache_alloc_us) / 1000, -+ float(event.journal_start_us) / 1000, float(event.block_alloc_us) / 1000, -+ float(event.balance_dirty_us) / 1000, float(event.write_begin_us) / 1000, -+ float(event.write_end_us) / 1000, float(event.sched_delay) / 1000, -+ event.file.decode())) -+ -+# initialize BPF -+b = BPF(text=bpf_text) -+ -+b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_write_entry") -+b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") -+ -+# trace address_space_operations .write_begin() -+b.attach_kprobe(event="ext4_da_write_begin", fn_name="trace_aops_write_begin_entry") -+b.attach_kprobe(event="ext4_write_begin", fn_name="trace_aops_write_begin_entry") -+b.attach_kretprobe(event="ext4_da_write_begin", fn_name="trace_aops_write_begin_return") -+b.attach_kretprobe(event="ext4_write_begin", fn_name="trace_aops_write_begin_return") -+ -+# trace address_space_operations .write_end() -+b.attach_kprobe(event="ext4_da_write_end", fn_name="trace_aops_write_end_entry") -+b.attach_kprobe(event="ext4_write_end", fn_name="trace_aops_write_end_entry") -+b.attach_kretprobe(event="ext4_da_write_end", fn_name="trace_aops_write_end_return") -+b.attach_kretprobe(event="ext4_write_end", fn_name="trace_aops_write_end_return") -+ -+# trace page cache allocation -+b.attach_kprobe(event="grab_cache_page_write_begin", fn_name="trace_pagecache_alloc_entry") -+b.attach_kretprobe(event="grab_cache_page_write_begin", fn_name="trace_pagecache_alloc_return") -+ -+# trace journal start -+b.attach_kprobe(event="jbd2__journal_start", fn_name="trace_journal_start_entry") -+b.attach_kretprobe(event="jbd2__journal_start", fn_name="trace_journal_start_return") -+ -+# trace block allocation -+b.attach_kprobe(event="ext4_da_get_block_prep", fn_name="trace_block_alloc_entry") -+b.attach_kretprobe(event="ext4_da_get_block_prep", fn_name="trace_block_alloc_return") -+b.attach_kprobe(event="_ext4_get_block", fn_name="trace_block_alloc_entry") -+b.attach_kretprobe(event="_ext4_get_block", fn_name="trace_block_alloc_return") -+ -+# trace balance dirty pages -+b.attach_kprobe(event="balance_dirty_pages_ratelimited", fn_name="trace_balance_dirty_entry") -+b.attach_kretprobe(event="balance_dirty_pages_ratelimited", fn_name="trace_balance_dirty_return") -+ -+# header -+if min_ms == 0: -+ print("Tracing ext4 operations") -+else: -+ print("Tracing ext4 operations slower than %d ms" % min_ms) -+ -+print("%-8s %-8s %-14s %-6s %-7s %-8s %7s %8s %10s %11s %13s %11s %9s %11s %s" % ("TIME", -+ "DEVICE", "COMM", "PID", "BYTES", "OFF_KB", "LAT(ms)", "pg_alloc", "jbd2_start", -+ "block_alloc", "balance_dirty", "write_begin", "write_end", "sched_delay", "FILENAME")) -+ -+# read events -+b["events"].open_perf_buffer(print_event, page_cnt=64) -+while 1: -+ try: -+ b.perf_buffer_poll() -+ -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/aliext4writeslower_example.txt b/tools/aliext4writeslower_example.txt -new file mode 100644 -index 0000000..b702d9d ---- /dev/null -+++ b/tools/aliext4writeslower_example.txt -@@ -0,0 +1,68 @@ -+Demonstrations of aliext4writeslower. -+ -+This script traces key steps of write(2) which maybe time-consuming, -+here trace 4 key steps: jbd2 handle start(it may need jbd2 transaction -+checkpoint or commit), page cache allocation, block allocation and -+balance dirty pages, and we also take into account sched delay. -+With this tool, we can clearly know write(2)'s latency proportion. -+ -+Usage: -+usage: aliext4writeslower [-h] [-p PID] [-d DEVICE] [min_ms] -+ -+Trace common ext4 write(2) slower than a threshold -+ -+positional arguments: -+ min_ms minimum I/O duration to trace, in ms (default 10) -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -p PID, --pid PID trace this PID only -+ -d DEVICE, --device DEVICE -+ inspect specified device -+ -+examples: -+ ./aliext4writeslower # trace write(2) slower than 10 ms (default) -+ ./aliext4writeslower 1 # trace write(2) slower than 1 ms -+ ./aliext4writeslower 0 # trace all operations (warning: verbose) -+ ./aliext4writeslower -d sda5 # inspect specified device /dev/sda5 containing ext4 fs -+ ./aliext4writeslower -p 185 # trace PID 185 only -+ -+ -+Output example: -+ -+Example 1: -+# Trace write(2) slower than 5 ms -+python aliext4writeslower.py 5 -+ -+Tracing ext4 operations slower than 5 ms -+TIME DEVICE COMM PID BYTES OFF_KB LAT(ms) pg_alloc jbd2_start block_alloc balance_dirty write_begin write_end sched_delay FILENAME -+17:14:07 sda5 dd 31149 1048576 473088 5.15 1.02 0.68 0.25 0.54 2.50 1.57 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 474112 5.27 1.07 0.68 0.27 0.54 2.58 1.59 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 475136 5.09 1.01 0.69 0.30 0.53 2.52 1.49 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 607232 5.12 1.09 0.67 0.21 0.54 2.52 1.50 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 637952 6.88 1.20 1.15 0.34 0.75 3.38 1.97 0.00 testfile -+17:14:07 sda5 dd 31149 1048576 674816 5.29 1.17 0.68 0.20 0.55 2.60 1.58 0.00 testfile -+17:14:08 sda5 dd 31149 1048576 738304 5.08 0.99 0.69 0.27 0.54 2.46 1.54 0.00 testfile -+17:14:08 sda5 dd 31149 1048576 790528 5.03 0.94 0.69 0.25 0.54 2.42 1.50 0.00 testfile -+17:14:08 sda5 dd 31149 1048576 803840 5.04 0.96 0.67 0.29 0.54 2.45 1.51 0.00 testfile -+17:14:08 sda5 dd 31149 1048576 870400 5.15 1.10 0.68 0.19 0.55 2.52 1.53 0.00 testfile -+17:14:09 sda5 dd 31149 1048576 986112 5.08 1.00 0.69 0.27 0.54 2.48 1.51 0.00 testfile -+ -+ -+Example 2: -+# Inspect specified device /dev/sda5 containing ext4 fs -+python aliext4writeslower.py -d sda5 -+ -+Tracing ext4 operations -+TIME DEVICE COMM PID BYTES OFF_KB LAT(ms) pg_alloc jbd2_start block_alloc balance_dirty write_begin write_end sched_delay FILENAME -+17:16:24 sda3 su 32894 292 0 0.03 0.00 0.00 0.00 0.00 0.01 0.00 0.00 lastlog -+17:16:24 sda3 syslog-ng 990 67 456973 0.07 0.01 0.01 0.00 0.00 0.02 0.01 0.00 secure-20190805 -+17:16:24 sda3 syslog-ng 990 106 456973 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 secure-20190805 -+17:16:24 sda3 syslog-ng 990 67 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 -+17:16:24 sda3 syslog-ng 990 106 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 -+17:16:24 sda3 ilogtail 56674 199 3296 0.04 0.00 0.01 0.00 0.00 0.01 0.01 0.00 000147.log -+17:16:24 sda3 syslog-ng 990 95 456974 0.04 0.01 0.01 0.00 0.00 0.01 0.01 0.00 secure-20190805 -+17:16:24 sda3 syslog-ng 990 95 475637 0.02 0.00 0.00 0.00 0.00 0.01 0.01 0.00 messages-20190805 -+17:16:24 sda3 xperf 2558 874 403 0.03 0.00 0.01 0.00 0.00 0.01 0.01 0.00 xperf.pmu.data.0 -+17:16:25 sda3 xperf 2558 819 403 0.06 0.01 0.01 0.00 0.01 0.02 0.01 0.00 xperf.pmu.data.0 -+17:16:25 sda3 syslog-ng 990 175 456974 0.05 0.01 0.01 0.00 0.00 0.01 0.01 0.00 secure-20190805 -diff --git a/tools/alihardirqs.py b/tools/alihardirqs.py -new file mode 100755 -index 0000000..b8b0500 ---- /dev/null -+++ b/tools/alihardirqs.py -@@ -0,0 +1,219 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alihardirqs Summarize hard IRQ (interrupt) event time. -+# Enhanced version of BCC hardirqs. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alihardirqs [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] [interval] [outputs] -+# -+# Copyright (c) 2015 Brendan Gregg. -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+ -+# arguments -+examples = """examples: -+ ./alihardirqs # sum hard irq event time -+ ./alihardirqs -D # show hard irq event time as histograms -+ ./alihardirqs -c 0 # show hard irq event time of CPU 0 only -+ ./alihardirqs -i 25 # show event time of IRQ 25 only -+ ./alihardirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only -+ ./alihardirqs 1 10 # print 1 second summaries, 10 times -+ ./alihardirqs -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize hard irq statistics, including total count, time, etc.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+parser.add_argument("-D", "--dist", action="store_true", -+ help="show distributions as histograms") -+parser.add_argument("-c", "--cpu", -+ help="output hardIRQ statistics on specific CPU only") -+parser.add_argument("-i", "--irq", -+ help="output statistics of specific hardIRQ") -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+distmode = "1" if args.dist else "0" -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct irq_key { -+ u32 cpu; -+ u32 irq; -+ char name[32]; -+} irq_key_t; -+ -+typedef struct irq_key_slot { -+ irq_key_t key; -+ u64 slot; -+} irq_key_slot_t; -+ -+typedef struct irq_val { -+ u64 count; -+ u64 time; -+ u64 time_max; -+} irq_val_t; -+ -+typedef struct stamp { -+ u64 ts; -+ char name[32]; -+} stamp_t; -+ -+BPF_PERCPU_ARRAY(start, stamp_t, 1); -+BPF_HASH(res, irq_key_t, irq_val_t); -+BPF_HISTOGRAM(dist, irq_key_slot_t); -+ -+ -+TRACEPOINT_PROBE(irq, irq_handler_entry) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ stamp_t stamp = { -+ .ts = bpf_ktime_get_ns(), -+ }; -+ TP_DATA_LOC_READ_CONST(stamp.name, name, sizeof(stamp.name)); -+ start.update(&idx, &stamp); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(irq, irq_handler_exit) -+{ -+ u64 delta; -+ u32 cpu = bpf_get_smp_processor_id(); -+ -+ u32 idx = 0; -+ // fetch timestamp and calculate delta -+ stamp_t *stampp = start.lookup(&idx); -+ if (!stampp || stampp->ts == 0) { -+ return 0; // missed start -+ } -+ -+ delta = bpf_ktime_get_ns() - stampp->ts; -+ irq_key_t key = {.cpu = cpu, .irq = args->irq}; -+ char *name = (char *)stampp->name; -+ bpf_probe_read(key.name, sizeof(key.name), name); -+ -+#if DIST_MODE -+ irq_key_slot_t key_slot = { -+ .key = key, -+ .slot = bpf_log2l(delta / FACTOR), -+ }; -+ dist.increment(key_slot); -+#else -+ struct irq_val *valp, val; -+ valp = res.lookup(&key); -+ -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ if (valp->time_max < delta) {valp->time_max = delta;} -+ } -+ else { -+ val.count = 1; -+ val.time = val.time_max = delta; -+ res.update(&key, &val); -+ } -+#endif -+ stamp_t zero = {}; -+ start.update(&idx, &zero); -+ -+ return 0; -+} -+""" -+ -+cond_filter = '' -+cpu_filter = "1" -+if args.cpu: -+ cpu_filter = "bpf_get_smp_processor_id() == %s" % args.cpu -+ -+irq_filter = "1" -+if args.irq: -+ irq_filter = "args->irq == %s" % args.irq -+ -+cond_filter = "if (!(%s && %s)) {return 0;}" % (cpu_filter, irq_filter) -+bpf_text = bpf_text.replace('COND_FILTER', cond_filter) -+ -+bpf_text = bpf_text.replace('FACTOR', "%d" % factor) -+bpf_text = bpf_text.replace('DIST_MODE', distmode) -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+def print_section(key): -+ return "CPU %d, hardIRQ %d (%s)" % (key[0], key[1], key[2].decode('utf-8', 'replace')) -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing hard irq event time... Hit Ctrl-C to end.") -+ -+if args.dist: -+ tab = b.get_table("dist") -+else: -+ tab = b.get_table("res") -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ countdown = 1 -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ if args.dist: -+ tab.print_log2_hist("time_" + label, "section", -+ section_print_fn=print_section, -+ bucket_fn=lambda k: (k.cpu, k.irq, k.name), -+ bucket_sort_fn=lambda keys: sorted(keys)) -+ else: -+ cur_cpu = -1 -+ for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].irq)): -+ if cur_cpu != k.cpu: -+ print("\nCPU%d:" % k.cpu) -+ print("%10s %20s %10s %10s %10s" % -+ ("HARDIRQ", "IRQHandler", "Count", "TOTAL_" + label, "MAX_" + label)) -+ cur_cpu = k.cpu -+ -+ print("%10d %20s %10d %10d %10d" % -+ (k.irq, k.name.decode('utf-8', 'replace'), v.count, v.time / factor, v.time_max / factor)) -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alihardirqs_example.txt b/tools/alihardirqs_example.txt -new file mode 100644 -index 0000000..caebb47 ---- /dev/null -+++ b/tools/alihardirqs_example.txt -@@ -0,0 +1,93 @@ -+Demonstrations of alihardirqs, the enhanced version of BCC hardirqs. -+ -+This program traces hard interrupts (irqs), and stores timing statistics -+in-kernel for efficiency. -+ -+ -+Example 1 -+ -+By default, all statistics of all hardirq on all CPUs are sampled, -+and the output is displayed in a per-hardirq and per-CPU flavor. -+ -+# alihardirqs -+Tracing hard irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU3: -+ HARDIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 31 virtio0-input.0 2 4 2 -+ -+ -+Example 2 -+ -+You can sample statistics on specified CPU only with "-c" option, -+and the output will be displayed in a per-hardirq flavor. -+ -+# alihardirqs -c 3 -+Tracing hard irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU3: -+ HARDIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 31 virtio0-input.0 3 10 7 -+ -+ -+Exampel 3 -+ -+You can sample statistics of specified hardirq only with "-i" option, -+and the output will be displayed in a per-CPU flavor. -+ -+# alihardirqs -i 31 -+Tracing hard irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU3: -+ HARDIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 31 virtio0-input.0 3 8 6 -+ -+ -+Example 4 -+ -+The distribution of hardirq run time can be printed as a histogram with "-D" -+option. The output will be displayed in a per-hardirq and per-CPU flavor. -+ -+# alihardirqs -D -+Tracing hard irq event time... Hit Ctrl-C to end. -+^C -+ -+section = CPU 3, hardIRQ 31 (virtio0-input.0) -+ time_usecs : count distribution -+ 0 -> 1 : 1 |****************************************| -+ 2 -> 3 : 0 | | -+ 4 -> 7 : 1 |****************************************| -+ -+ -+ -+USAGE message: -+ -+# alihardirqs -h -+usage: alihardirqs.py [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] -+ [interval] [outputs] -+ -+Summarize hard irq statistics, including total count, time, etc. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -D, --dist show distributions as histograms -+ -c CPU, --cpu CPU output hardIRQ statistics on specific CPU only -+ -i IRQ, --irq IRQ output statistics of specific hardIRQ -+ -+examples: -+ ./alihardirqs # sum hard irq event time -+ ./alihardirqs -D # show hard irq event time as histograms -+ ./alihardirqs -c 0 # show hard irq event time of CPU 0 only -+ ./alihardirqs -i 25 # show event time of IRQ 25 only -+ ./alihardirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only -+ ./alihardirqs 1 10 # print 1 second summaries, 10 times -+ ./alihardirqs -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/alijbd2stat.py b/tools/alijbd2stat.py -new file mode 100755 -index 0000000..f727fc1 ---- /dev/null -+++ b/tools/alijbd2stat.py -@@ -0,0 +1,392 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alijbd2stat Summarize filesystem jbd2 stat. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alijbd2stat [-h] [-t thresh] [-d device] -+# -+# Copyright (c) 2019-2021 Alibaba Group. -+# Licensed under the Apache License, Version 2.0 (the "License") -+# -+# 2019/08/05 Xiaoguang Wang Created this. -+ -+from __future__ import print_function -+from bcc import BPF -+import ctypes as ct -+import argparse -+import time -+ -+# arguments -+examples = """examples: -+ ./alijbd2stat # summarize filesystem jbd2 stat -+ ./alijbd2stat -d sda3 # inspect specified device /dev/sda3 -+ ./alijbd2stat -t 10 # show jbd2 handle's context when it runs more than 10ms -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize filesystem jbd2 stat", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-d", "--device", help="inspect specified device") -+parser.add_argument("-t", "--thresh_time", nargs="?", default=0, -+ help="show jbd2 handle's context when its run time is greater than this value") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+args = parser.parse_args() -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+#include -+#include -+#include -+ -+struct handle_stat { -+ dev_t dev; -+ u32 type; -+ u32 line_no; -+ unsigned long tid; -+ unsigned long pre_start_time; -+ unsigned long start_time; -+ unsigned long end_time; -+ u64 sched_delay; -+ u64 kernel_ip; -+ int kernel_stack_id; -+ u32 pid; -+ char comm[TASK_COMM_LEN]; -+}; -+ -+struct transaction_run_stat { -+ dev_t dev; -+ u32 handle_count; -+ u32 blocks; -+ u32 blocks_logged; -+ unsigned long tid; -+ unsigned long wait; -+ unsigned long request_delay; -+ unsigned long running; -+ unsigned long locked; -+ unsigned long flushing; -+ unsigned long logging; -+}; -+ -+/* -+struct handle_info_per_type { -+ u32 count -+}; -+*/ -+ -+BPF_HASH(handle_stat_map, struct task_struct *, struct handle_stat); -+BPF_STACK_TRACE(stack_traces, 1024); -+BPF_PERF_OUTPUT(jbd2_handle_stat); -+BPF_PERF_OUTPUT(transaction_stat); -+ -+ -+int trace_start_this_handle(struct pt_regs *ctx, journal_t *journal, -+ handle_t *handle, gfp_t gfp_mask) -+{ -+ struct handle_stat s; -+ struct task_struct *t = (struct task_struct *)bpf_get_current_task(); -+ dev_t dev = journal->j_fs_dev->bd_dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ memset(&s, 0, sizeof(struct handle_stat)); -+ s.dev = dev; -+ s.pre_start_time = bpf_ktime_get_ns(); -+ s.sched_delay = t->sched_info.run_delay; -+ -+ handle_stat_map.update(&t, &s); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(jbd2, jbd2_handle_start) -+{ -+ struct handle_stat *s; -+ struct task_struct *t = (struct task_struct *)bpf_get_current_task(); -+ dev_t dev = args->dev; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ s = handle_stat_map.lookup(&t); -+ if (s == NULL) -+ return 0; -+ -+ s->start_time = bpf_ktime_get_ns(); -+ s->type = args->type; -+ s->line_no = args->line_no; -+ s->tid = args->tid; -+ return 0; -+} -+ -+int trace_jbd2_journal_stop(struct pt_regs *ctx, handle_t *handle) -+{ -+ transaction_t *transaction = handle->h_transaction; -+ journal_t *journal; -+ struct handle_stat *s; -+ struct handle_stat s2 = {}; -+ struct task_struct *t = (struct task_struct *)bpf_get_current_task(); -+ unsigned long end_time = bpf_ktime_get_ns(); -+ unsigned long run_time; -+ -+ if (transaction == NULL) { -+ handle_stat_map.delete(&t); -+ return 0; -+ } -+ -+ journal = transaction->t_journal; -+ dev_t dev = journal->j_fs_dev->bd_dev; -+ if (FILTER_DEV) -+ return 0; -+ -+ if (handle->h_ref >= 2) -+ return 0; -+ -+ s = handle_stat_map.lookup(&t); -+ if (s == NULL) -+ return 0; -+ -+ /* -+ * This could happen, jbd2_journal_start_reserved() miss a trace -+ * jbd2_handle_start(). -+ */ -+ if (!s->start_time) -+ s->start_time = s->pre_start_time; -+ -+ -+ run_time = end_time - s->pre_start_time; -+ if (run_time < FILTER_THRESH_TIME * 1000 * 1000LLU) { -+ handle_stat_map.delete(&t); -+ return 0; -+ } -+ -+ s2.dev = s->dev; -+ s2.type = s->type; -+ s2.line_no = s->line_no; -+ s2.tid = s->tid; -+ s2.pre_start_time = s->pre_start_time; -+ s2.start_time = s->start_time; -+ s2.end_time = end_time; -+ s2.pid = bpf_get_current_pid_tgid(); -+ s2.sched_delay = t->sched_info.run_delay - s->sched_delay; -+ bpf_get_current_comm(&s2.comm, sizeof(s2.comm)); -+ -+ s2.kernel_stack_id = stack_traces.get_stackid(ctx, 0); -+ if (s2.kernel_stack_id >= 0) { -+ // populate extras to fix the kernel stack -+ u64 ip = PT_REGS_IP(ctx); -+ u64 page_offset; -+ -+ // if ip isn't sane, leave key ips as zero for later checking -+ #if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE) -+ // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it -+ page_offset = __PAGE_OFFSET_BASE; -+ #elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4) -+ // x64, 4.17, and later -+ #if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL) -+ page_offset = __PAGE_OFFSET_BASE_L5; -+ #else -+ page_offset = __PAGE_OFFSET_BASE_L4; -+ #endif -+ #else -+ // earlier x86_64 kernels, e.g., 4.6, comes here -+ // arm64, s390, powerpc, x86_32 -+ page_offset = PAGE_OFFSET; -+ #endif -+ -+ if (ip > page_offset) -+ s2.kernel_ip = ip; -+ } -+ -+ jbd2_handle_stat.perf_submit(ctx, &s2, sizeof(struct handle_stat)); -+ handle_stat_map.delete(&t); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(jbd2, jbd2_run_stats) -+{ -+ struct transaction_run_stat s; -+ dev_t dev = args->dev;; -+ -+ if (FILTER_DEV) -+ return 0; -+ -+ memset(&s, 0, sizeof(struct transaction_run_stat)); -+ s.dev = args->dev; -+ s.tid = args->tid; -+ s.wait = args->wait; -+ s.request_delay = args->request_delay; -+ s.running = args->running; -+ s.locked = args->locked; -+ s.flushing = args->flushing; -+ s.logging = args->logging; -+ s.handle_count = args->handle_count; -+ s.blocks = args->blocks; -+ s.blocks_logged = args->blocks_logged; -+ -+ transaction_stat.perf_submit(args, &s, sizeof(struct transaction_run_stat)); -+ return 0; -+} -+ -+""" -+devid2name={} -+num_trans=0 -+ -+def init_dev_name(): -+ global devid2name -+ -+ f = open("/proc/partitions") -+ for line in f.readlines(): -+ line = line.strip() -+ if not len(line): -+ continue; -+ -+ str = line.split() -+ if str[0].isalpha(): -+ continue -+ dev_num = (int(str[0]) << 20) + int(str[1]) -+ devid2name[dev_num] = str[3] -+ f.close() -+ -+init_dev_name() -+ -+name2devid={v:k for k,v in devid2name.items()} -+ -+if args.device: -+ bpf_text = bpf_text.replace('FILTER_DEV', 'dev != %u' % name2devid[args.device]) -+else: -+ bpf_text = bpf_text.replace('FILTER_DEV', '0') -+ -+if args.thresh_time: -+ bpf_text = bpf_text.replace('FILTER_THRESH_TIME', '%s' % args.thresh_time) -+else: -+ bpf_text = bpf_text.replace('FILTER_THRESH_TIME', '9999') -+ -+# code substitutions -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+# load BPF program -+b = BPF(text=bpf_text) -+ -+b.attach_kprobe(event="start_this_handle", fn_name="trace_start_this_handle") -+b.attach_kprobe(event="jbd2_journal_stop", fn_name="trace_jbd2_journal_stop") -+ -+print("Tracing jbd2 stats... Hit Ctrl-C to end.") -+ -+TASK_COMM_LEN = 16 -+ -+class HandleStat(ct.Structure): -+ _fields_ = [ -+ ("dev", ct.c_uint), -+ ("type", ct.c_uint), -+ ("line_no", ct.c_uint), -+ ("tid", ct.c_ulonglong), -+ ("pre_start_time", ct.c_ulonglong), -+ ("start_time", ct.c_ulonglong), -+ ("end_time", ct.c_ulonglong), -+ ("sched_delay", ct.c_ulonglong), -+ ("kernel_ip", ct.c_ulonglong), -+ ("kernel_stack_id", ct.c_int), -+ ("pid", ct.c_uint), -+ ("comm", ct.c_char * TASK_COMM_LEN), -+ ] -+ -+class Data(ct.Structure): -+ _fields_ = [ -+ ("dev", ct.c_uint), -+ ("handle_count", ct.c_uint), -+ ("blocks", ct.c_uint), -+ ("blocks_logged", ct.c_uint), -+ ("tid", ct.c_ulonglong), -+ ("wait", ct.c_ulonglong), -+ ("request_delay", ct.c_ulonglong), -+ ("running", ct.c_ulonglong), -+ ("locked", ct.c_ulonglong), -+ ("flushing", ct.c_ulonglong), -+ ("logging", ct.c_ulonglong), -+ ] -+ -+class trans_stat: -+ def __init__(self, dev): -+ self.dev=dev -+ self.handle_count=0 -+ self.blocks=0 -+ self.blocks_logged=0 -+ self.tid=0 -+ slef.wait=0 -+ self.request_delay=0 -+ self.running=0 -+ self.locked=0 -+ self.flushing=0 -+ self.logging=0 -+ -+ -+def stack_id_err(stack_id): -+ # -EFAULT in get_stackid normally means the stack-trace is not available, -+ # Such as getting kernel stack trace in userspace code -+ return (stack_id < 0) and (stack_id != -errno.EFAULT) -+ -+def aksym(addr): -+ return b.ksym(addr) + "_[k]".encode() -+ -+def print_stack(cpu, data, size): -+ k = ct.cast(data, ct.POINTER(HandleStat)).contents -+ -+ kernel_tmp = [] if k.kernel_stack_id < 0 else stack_traces.walk(k.kernel_stack_id) -+ -+ # fix kernel stack -+ kernel_stack = [] -+ if k.kernel_stack_id >= 0: -+ for addr in kernel_tmp: -+ kernel_stack.append(addr) -+ # the later IP checking -+ if k.kernel_ip: -+ kernel_stack.insert(0, k.kernel_ip) -+ -+ # print default multi-line stack output -+ if stack_id_err(k.kernel_stack_id): -+ print(" [Missed Kernel Stack]") -+ else: -+ print("comm: %s pid: %u tid: %lu pre_start_time: %lu run_time: %lu sched_delay: %lu" % -+ (k.comm, k.pid, k.tid, (k.start_time - k.pre_start_time) / 1000, -+ (k.end_time - k.pre_start_time) / 1000, k.sched_delay / 1000 / 1000)) -+ for addr in kernel_stack: -+ print(" %s" % aksym(addr)) -+ #print(" %-16s %s (%d)" % ("-", k.comm.decode('utf-8', 'replace'), k.pid)) -+ #print() -+ -+def print_header(): -+ print("\n%-8s %-12s %-4s %-13s %-7s %-6s %-8s %-7s %-12s %-6s %-13s" % ("dev", -+ "tid", "wait", "request_delay", "running", "locked", "flushing", "logging", -+ "handle_count", "blocks", "blocks_logged")) -+ -+# process event -+def print_event(cpu, data, size): -+ global num_trans -+ -+ event = ct.cast(data, ct.POINTER(Data)).contents -+ if (num_trans % 10) == 0: -+ print_header() -+ -+ print("%-8s %-12lu %-4lu %-13lu %-7lu %-6lu %-8lu %-7lu %-12lu %-6lu %-13lu" % -+ (devid2name[event.dev], event.tid, event.wait, event.request_delay, event.running, -+ event.locked, event.flushing, event.logging, event.handle_count, event.blocks, -+ event.blocks_logged)) -+ num_trans = num_trans + 1 -+ -+label="usecs" -+stack_traces = b["stack_traces"] -+b["jbd2_handle_stat"].open_perf_buffer(print_stack, page_cnt=64) -+b["transaction_stat"].open_perf_buffer(print_event, page_cnt=64) -+while 1: -+ try: -+ b.perf_buffer_poll() -+ -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/alijbd2stat_example.txt b/tools/alijbd2stat_example.txt -new file mode 100644 -index 0000000..a65f12e ---- /dev/null -+++ b/tools/alijbd2stat_example.txt -@@ -0,0 +1,83 @@ -+Demonstrations of alijbd2stat. -+Alijbd2stat can trace every jbd2 handle and every transaction commit -+info. If one jbd2 handle's running time is greater than user specified -+threshold time, alijbd2stat will catch this handle's context. -+ -+Usage: -+alijbd2stat [-h] [-d DEVICE] [-t [THRESH_TIME]] -+ -+Summarize filesystem jbd2 stat -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -d DEVICE, --device DEVICE -+ inspect specified device -+ -t [THRESH_TIME], --thresh_time [THRESH_TIME] -+ show jbd2 handle's context when its run time is -+ greater than this value -+ -+examples: -+ ./alijbd2stat # show jbd2 transaction commit info. -+ ./alijbd2stat -d sda3 # inspect specified device /dev/sda3. -+ ./alijbd2stat -t 10 # show jbd2 handle's context when it runs more than 10ms. -+ -+ -+Output example: -+ -+Example 1: -+# Show jbd2 transaction commit info. -+python alijbd2stat.py -+ -+dev tid wait request_delay running locked flushing logging handle_count blocks blocks_logged -+sda5 11122351 0 0 5052 0 0 0 61 34 13 -+sda3 13641140 0 0 5280 0 0 0 138 19 20 -+sda5 11122352 0 0 5041 0 0 0 59 35 14 -+sda3 13641141 0 0 5000 0 0 0 113 9 10 -+sda5 11122353 0 0 5069 0 0 0 91 47 18 -+ -+Example 2: -+# inspect specified device /dev/sda5 -+python alijbd2stat.py -d sda5 -+ -+dev tid wait request_delay running locked flushing logging handle_count blocks blocks_logged -+sda5 11122461 0 0 5029 0 0 0 75 38 15 -+sda5 11122462 0 0 5107 0 0 0 74 42 17 -+sda5 11122463 0 0 5028 0 0 1 88 50 17 -+sda5 11122464 0 0 5128 0 0 0 71 40 19 -+sda5 11122465 0 0 5243 0 0 0 83 44 13 -+sda5 11122466 0 0 5176 0 0 0 90 63 20 -+sda5 11122467 0 0 5029 0 0 0 60 34 13 -+sda5 11122468 0 0 5028 0 0 1 79 38 19 -+sda5 11122469 0 0 5030 0 0 0 44 27 12 -+sda5 11122470 0 0 5614 0 0 0 106 62 25 -+ -+Example 3: -+# inspect specified device /dev/sda5, and show jbd2 handle's context when one -+# jbd2 handle runs more than 1ms. -+python alijbd2stat.py -d sda5 -t 1 -+ -+comm: dd pid: 106212 tid: 11151230 pre_start_time: 0 run_time: 585432 sched_delay: 0 -+ jbd2_journal_stop_[k] -+ jbd2_journal_stop_[k] -+ ext4_dirty_inode_[k] -+ __mark_inode_dirty_[k] -+ generic_write_end_[k] -+ ext4_da_write_end_[k] -+ generic_perform_write_[k] -+ __generic_file_write_iter_[k] -+ ext4_file_write_iter_[k] -+ __vfs_write_[k] -+ vfs_write_[k] -+ sys_write_[k] -+ do_syscall_64_[k] -+ entry_SYSCALL_64_after_swapgs_[k] -+comm: kworker/u128:3 pid: 83229 tid: 0 pre_start_time: 0 run_time: 589686 sched_delay: 0 -+ jbd2_journal_stop_[k] -+ jbd2_journal_stop_[k] -+ ext4_convert_unwritten_extents_[k] -+ ext4_end_io_rsv_work_[k] -+ process_one_work_[k] -+ worker_thread_[k] -+ kthread_[k] -+ ret_from_fork_[k] -+sda5 11151229 0 0 5000 0 0 595 401233 255 36 -diff --git a/tools/alimutexsnoop.py b/tools/alimutexsnoop.py -new file mode 100755 -index 0000000..d28ce05 ---- /dev/null -+++ b/tools/alimutexsnoop.py -@@ -0,0 +1,273 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+from __future__ import print_function -+ -+import argparse -+import ctypes as ct -+import os -+import platform -+import re -+import sys -+import time -+ -+from bcc import BPF -+from bcc.utils import printb -+from datetime import datetime -+from time import strftime -+ -+# -+# alimutexsnoop.py Trace all process that holds a mutex longer than a -+# specified time interval (500us, etc). -+# For Linux, uses BCC, eBPF. Embedded C. -+# -+# USAGE: alimutexsnoop [-h] [-t] [-p PID] -+# -+# Author: Alvin Zheng -+_examples = """examples: -+ alimutexsnoop # trace all mutexes that are held for a long time -+ alimutexsnoop -t 50 # set the time threshold (us), mutexes are held longer -+ than this time threshold will be printed -+ alimutexsnoop -p 123 # only trace the specified process -+""" -+""" -+ Copyright (c) 2019 Alvin Zheng, Alibaba, Inc. -+ Licensed under the Apache License, Version 2.0 (the "License") -+""" -+ -+def positive_int(val): -+ try: -+ ival = int(val) -+ except ValueError: -+ raise argparse.ArgumentTypeError("must be an integer") -+ if ival <= 0: -+ raise argparse.ArgumentTypeError("must be positive value") -+ return ival -+ -+def _getParser(): -+ parser = argparse.ArgumentParser( -+ description="Trace all mutexes that have been held for a long time", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=_examples) -+ a = parser.add_argument -+ a("-t", "--threshold", type=positive_int, default=1000, -+ help="The mutexes held longer than this time threshold will be traced(us)") -+ a("-p", "--pid", type=positive_int, -+ help="trace this PID only") -+ return parser.parse_args -+ -+ -+class Global(): -+ parse_args = _getParser() -+ args = None -+ argv = None -+ bpf = None -+ -+class Data(ct.Structure): -+ """Event data matching struct data_t in _embedded_c().""" -+ _TASK_COMM_LEN = 16 # linux/sched.h -+ _pack_ = 1 -+ _fields_ = [ -+ ("start_time", ct.c_ulonglong), # task->start_time -+ ("pid_tgid", ct.c_ulonglong), -+ ("pid", ct.c_uint), # task->tgid, thread group id == sys_getpid() -+ ("mutex_addr", ct.c_ulonglong), -+ ("k_stack_id", ct.c_int), -+ ("u_stack_id", ct.c_int), -+ ("task", ct.c_char * _TASK_COMM_LEN) -+ ] -+ -+def _embedded_c(args): -+ """Generate C programs for mutex_lock and mutex_unlock.""" -+ c = """ -+ EBPF_COMMENT -+ #include -+ BPF_STATIC_ASSERT_DEF -+ BPF_STACK_TRACE(stack_traces,655360); -+ struct key_t{ -+ u64 pid_tgid; -+ u64 mutex_addr; -+ } __attribute__((packed)); -+ BPF_HASH(held_mutexes, struct key_t,u64); -+ struct data_t{ -+ u64 start_time; -+ u64 pid_tgid; -+ u32 pid; -+ u64 mutex_addr; -+ u32 k_stack_id; -+ u32 u_stack_id; -+ char task[TASK_COMM_LEN]; -+ } __attribute__((packed)); -+ BPF_STATIC_ASSERT(sizeof(struct data_t) == CTYPES_SIZEOF_DATA); -+ BPF_PERF_OUTPUT(results); -+ -+ int kprobe_ret_mutex_lock(struct pt_regs * ctx,struct mutex * lock) -+ { -+ struct task_struct *task = (typeof(task))bpf_get_current_task(); -+ FILTER_SELF_LOCK -+ if (FILTER_PID) { return 0; } -+ struct key_t key = { -+ .pid_tgid=bpf_get_current_pid_tgid(), -+ .mutex_addr=(u64)((void*)lock) -+ }; -+ u64 locktime = bpf_ktime_get_ns(); -+ u64 * ptr=held_mutexes.lookup_or_init(&key, &locktime); -+ if(!ptr){ -+ bpf_trace_printk("could not add held_mutexes, thread: %d, mutex: %p \\n" ,task->tgid,lock); -+ return 1; -+ } -+ return 0; -+ } -+ int kprobe_mutex_unlock(struct pt_regs * ctx, struct mutex * lock) -+ { -+ struct task_struct *task = (typeof(task))bpf_get_current_task(); -+ FILTER_SELF_UNLOCK -+ if (FILTER_PID) { return 0; } -+ struct key_t key = { -+ .pid_tgid=bpf_get_current_pid_tgid(), -+ .mutex_addr=(u64)((void*)lock) -+ }; -+ u64 cur = bpf_ktime_get_ns(); -+ u64 * locktime=held_mutexes.lookup(&key); -+ if(!locktime){ -+ bpf_trace_printk("cannot find the lock in thread: %d mutex: %p \\n",task->tgid,lock); -+ return 1; -+ } -+ if(cur - (*locktime) > TIME_THRESHOLD){ -+ struct data_t data = { -+ .start_time=task->start_time, -+ .pid_tgid=bpf_get_current_pid_tgid(), -+ .pid=task->pid, -+ .mutex_addr=(u64)((void*)lock) -+ }; -+ bpf_get_current_comm(&data.task,sizeof(data.task)); -+ data.k_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID); -+ data.u_stack_id=stack_traces.get_stackid(ctx,BPF_F_REUSE_STACKID|BPF_F_USER_STACK); -+ results.perf_submit(ctx,&data,sizeof(data)); -+ } -+ held_mutexes.delete(&key); -+ return 0; -+ } -+ """ -+ # TODO: this macro belongs in bcc/src/cc/export/helpers.h -+ bpf_static_assert_def = r""" -+ #ifndef BPF_STATIC_ASSERT -+ #define BPF_STATIC_ASSERT(condition) __attribute__((unused)) \ -+ extern int bpf_static_assert[(condition) ? 1 : -1] -+ #endif -+ """ -+ selfpid = os.getpid() -+ filter_self = r""" -+ if(task->tgid==%d){ -+ return 0; -+ } -+ """ % selfpid -+ code_substitutions = [ -+ ('EBPF_COMMENT', _ebpf_comment()), -+ ("BPF_STATIC_ASSERT_DEF", bpf_static_assert_def), -+ ("CTYPES_SIZEOF_DATA", str(ct.sizeof(Data))), -+ ("TIME_THRESHOLD", str(Global.args.threshold * 1000)), # ns->us -+ ("FILTER_SELF_LOCK", filter_self), -+ ("FILTER_SELF_UNLOCK", filter_self), -+ ('FILTER_PID', '0' if not Global.args.pid else "task->pid != %d" % Global.args.pid) -+ ] -+ for old, new in code_substitutions: -+ c = c.replace(old, new) -+ # print(c) -+ return c -+ -+def _ebpf_comment(): -+ """Return a C-style comment with information about the generated code.""" -+ comment = ('Created by %s at %s:\n\t%s' % -+ (sys.argv[0], strftime("%Y-%m-%d %H:%M:%S %Z"), _embedded_c.__doc__)) -+ args = str(vars(Global.args)).replace('{', '{\n\t').replace(', ', ',\n\t').replace('}', ',\n }\n\n') -+ return ("\n /*" + ("\n %s\n\n ARGV = %s\n\n ARGS = %s/" % -+ (comment, ' '.join(Global.argv), args)) -+ .replace('\n', '\n\t*').replace('\t', ' ')) -+ -+def _print_header(): -+ print("%-16s %-6s %-16s \n" % ("PCOMM", "PID", "MUTEX_ADDR")) -+ -+def _print_event(cpu, data, size): # callback -+ """Print the long-held mutexes.""" -+ stack_traces = Global.bpf['stack_traces'] -+ e = ct.cast(data, ct.POINTER(Data)).contents -+ print("%-16s %-6d %-16x\n" % -+ (e.task.decode(), e.pid, e.mutex_addr), end="") -+ # print the kernel stack -+ if e.k_stack_id >= 0: -+ cur_stack = reversed(list(stack_traces.walk(e.k_stack_id))) -+ for addr in cur_stack: -+ printb(b"Kernel Stack: %-16x %s" % (addr, Global.bpf.ksym(addr))) -+ else: -+ print("No KERNEL STACK FOUND") -+ # print the user stack -+ if e.u_stack_id >= 0: -+ cur_stack = reversed(list(stack_traces.walk(e.u_stack_id))) -+ for addr in cur_stack: -+ printb(b"USER Stack: %-16x %s" % (addr, Global.bpf.sym(addr, e.pid_tgid))) -+ else: -+ print("No USER STACK FOUND") -+ print("=================================") -+ print() -+# ============================= -+# Module: These functions are available for import -+# ============================= -+def initialize(arg_list=sys.argv[1:]): -+ """Trace all long-held mutexes -+ -+ arg_list - list of args, if omitted then uses command line args -+ arg_list is passed to argparse.ArgumentParser.parse_args() -+ -+ Returns a tuple (return_code, result) -+ 0 = Ok, result is the return value from BPF() -+ 1 = args.ebpf is requested, result is the generated C code -+ os.EX_NOPERM: need CAP_SYS_ADMIN, result is error message -+ os.EX_SOFTWARE: internal software error, result is error message -+ """ -+ Global.argv = arg_list -+ Global.args = Global.parse_args(arg_list) -+ if os.geteuid() != 0: -+ return (os.EX_NOPERM, "Need sudo (CAP_SYS_ADMIN) for BPF() system call") -+ c = _embedded_c(Global.args) -+ try: -+ return (os.EX_OK, BPF(text=c)) -+ except Exception as e: -+ error = format(e) -+ return (os.EX_SOFTWARE, "BPF error: " + error) -+ -+def snoop(bpf, event_handler): -+ """Call event_handler when release a lock that is held for a long time -+ bpf - result returned by successful initialize() -+ event_handler - callback function to handle termination event -+ args.pid - Return after event_handler is called, only monitoring this pid -+ """ -+ bpf["results"].open_perf_buffer(event_handler) -+ while True: -+ bpf.perf_buffer_poll() -+ if Global.args.pid: -+ if not os.path.exists('/proc/%d' % Global.args.pid): -+ # the target process has exited -+ break -+ -+# ============================= -+# Script: invoked as a script -+# ============================= -+def main(): -+ try: -+ rc, buffer = initialize() -+ if rc != os.EX_OK: -+ print(buffer) -+ sys.exit(-1) -+ Global.bpf = buffer -+ Global.bpf.attach_kretprobe(event="mutex_lock", fn_name="kprobe_ret_mutex_lock") -+ Global.bpf.attach_kprobe(event="mutex_unlock", fn_name="kprobe_mutex_unlock") -+ _print_header() -+ snoop(buffer, _print_event) -+ except KeyboardInterrupt: -+ print() -+ sys.exit() -+ -+ return 0 -+ -+if __name__ == '__main__': -+ main() -diff --git a/tools/alimutexsnoop_example.txt b/tools/alimutexsnoop_example.txt -new file mode 100644 -index 0000000..67ca03e ---- /dev/null -+++ b/tools/alimutexsnoop_example.txt -@@ -0,0 +1,79 @@ -+Demonstrations of alimutexsnoop. -+ -+This profile tool monitors the mutex that is held by the process longer than -+a specified time threshold, such as 100us. Once alimutexsnoop finds such mutexes, -+it prints the pid of the process that held the mutex and the address of the -+mutex. In addition, alimutexsnoop also prints the user and kernel stack of the -+process. -+ -+ -+Usage: -+alimutexsnoop.py [-h] [-t THRESHOLD] [-p PID] -+ -+Trace all mutexes that have been held for a long time -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -t THRESHOLD, --threshold THRESHOLD -+ The mutexes held longer than this time threshold will -+ be traced(us) -+ -p PID, --pid PID trace this PID only -+ -+examples: -+ alimutexsnoop # trace all mutexes that are held for a long time -+ alimutexsnoop -t 50 # set the time threshold (us), mutexes are held longer -+ than this time threshold will be printed -+ alimutexsnoop -p 123 # only trace the specified process -+ -+ -+Output example: -+ -+python alimutexsnoop.py -t 20 # monitor the mutexes that are held by processes longer than 20 us -+ -+PCOMM PID MUTEX_ADDR -+ -+AliYunDun 4283 ffffa30fd1b4ea48 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316fc ksys_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7f1532c4e6fd read -+================================= -+ -+AliYunDun 4283 ffffa310d7e43c40 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316ba ksys_read -+Kernel Stack: ffffffffa5231149 vfs_read -+Kernel Stack: ffffffffa5230f86 __vfs_read -+Kernel Stack: ffffffffa5254b30 seq_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7f1532c4e6fd read -+================================= -+ -+AliYunDun 4283 ffffa30fd1b4e348 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316fc ksys_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7f1532c4e6fd read -+================================= -+ -+AliYunDunUpdate 4256 ffffa310d1146a40 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316ba ksys_read -+Kernel Stack: ffffffffa5231149 vfs_read -+Kernel Stack: ffffffffa5230f86 __vfs_read -+Kernel Stack: ffffffffa5254b30 seq_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7fbce69966fd read -+================================= -+ -+AliYunDunUpdate 4256 ffffa310d3b48a48 -+Kernel Stack: ffffffffa5800088 entry_SYSCALL_64_after_hwframe -+Kernel Stack: ffffffffa500201b do_syscall_64 -+Kernel Stack: ffffffffa52316fc ksys_read -+Kernel Stack: ffffffffa57cfd21 mutex_unlock -+USER Stack: 7fbce69966fd read -+================================= -diff --git a/tools/alirunqinfo.py b/tools/alirunqinfo.py -new file mode 100755 -index 0000000..102c763 ---- /dev/null -+++ b/tools/alirunqinfo.py -@@ -0,0 +1,360 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alirunqinfo Summarize task on-CPU time before being descheduled. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alirunqinfo [-h] [-T] [-N] [-D] [-C] [-P] [-c CPU] [-p PID] -+# [interval] [outputs] -+# -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+import ctypes -+import signal -+ -+# arguments -+examples = """examples: -+ ./alirunqinfo # sum runqueue statistics -+ ./alirunqinfo -C # show per-CPU runqueue statistics -+ ./alirunqinfo -P # show per-thread runqueue statistics -+ ./alirunqinfo -D # show runqueue statistics as histograms -+ ./alirunqinfo -c 0 # show runqueue statistics of CPU 0 only -+ ./alirunqinfo -p 25 # show runqueue statistics of PID 25 only -+ ./alirunqinfo 1 10 # print 1 second summaries, 10 times -+ ./alirunqinfo -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize runqueue statistics.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+parser.add_argument("-C", "--cpus", action="store_true", -+ help="show per-CPU runqueue statistics") -+parser.add_argument("-P", "--pids", action="store_true", -+ help="show per-thread runqueue statistics") -+parser.add_argument("-D", "--dist", action="store_true", -+ help="show distributions as histograms") -+parser.add_argument("-c", "--cpu", -+ help="show runqueue statistics on specific CPU only") -+parser.add_argument("-p", "--pid", -+ help="show runqueue statistics of specific PID only") -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+if args.pid: -+ args.pids = True -+if args.cpu: -+ args.cpus = True -+if (not args.cpus) and (not args.pids): -+ args.cpus = True -+ -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct time_stamp { -+ u64 stamp; -+ u32 pid; -+} time_stamp_t; -+ -+BPF_PERCPU_ARRAY(start, time_stamp_t, 1); -+ -+STORAGE -+ -+ -+static inline int sched_switch_exit(u32 cpu, u32 pid, u64 now, time_stamp_t *new) { -+ PID_FILTER -+ -+ u32 idx = 0; -+ // fetch timestamp and calculate delta -+ time_stamp_t *stampp = start.lookup(&idx); -+ if (!stampp || stampp->pid != pid) { -+ return 0; // missed start -+ } -+ u64 delta = now - stampp->stamp; -+ start.update(&idx, new); -+ -+ UPDATE -+ -+ return 0; -+} -+ -+static inline int sched_switch_enter(u32 pid, u64 now, time_stamp_t *new) { -+ PID_FILTER -+ -+ u32 idx = 0; -+ new->stamp = now; -+ new->pid = pid; -+ -+ start.update(&idx, new); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(sched, sched_switch) -+{ -+ u32 cpu = bpf_get_smp_processor_id(); -+ CPU_FILTER -+ -+ u64 now = bpf_ktime_get_ns(); -+ u32 pid, idx = 0; -+ time_stamp_t new = {}; -+ -+ pid = args->prev_pid; -+ if (pid != 0) { -+ sched_switch_exit(cpu, pid, now, &new); -+ } -+ -+ pid = args->next_pid; -+ if (pid != 0) { -+ sched_switch_enter(pid, now, &new); -+ } -+ -+ return 0; -+} -+""" -+ -+ -+# replace STORAGE -+map_key = """ -+typedef struct map_key { -+ u32 cpu; -+ u32 pid; -+} map_key_t; -+""" -+ -+map_val = """ -+typedef struct map_val { -+ u64 time; -+ u64 count; -+} map_val_t; -+""" -+ -+if args.dist: -+ # diagram mode, we need BPF_HISTOGRAM -+ if args.cpus and args.pids: -+ storage = map_key + """ -+typedef struct map_key_slot { -+ map_key_t key; -+ u64 slot; -+} map_key_slot_t; -+""" -+ else: -+ storage = """ -+typedef struct map_key_slot { -+ u64 key; -+ u64 slot; -+} map_key_slot_t; -+""" -+ storage += """BPF_HISTOGRAM(dist, map_key_slot_t);""" -+ -+else: -+ # normal mode, we need BPF_HASH -+ storage = map_val -+ if args.cpus and args.pids: -+ storage += map_key + """BPF_HASH(res, map_key_t, map_val_t);""" -+ else: -+ storage += """BPF_HASH(res, u32, map_val_t);""" -+ -+bpf_text = bpf_text.replace('STORAGE', storage) -+ -+ -+# replace UPDATE -+if args.dist: -+ if args.cpus and args.pids: -+ key_exp = "{.cpu = cpu, .pid = pid,}" -+ elif args.cpus: -+ key_exp = "cpu" -+ elif args.pids: -+ key_exp = "pid" -+ -+ update = """ -+ map_key_slot_t key_slot = { -+ .key = %s, -+ .slot = bpf_log2l(delta / FACTOR), -+ }; -+ dist.increment(key_slot); -+""" % key_exp -+ -+else: -+ if args.cpus and args.pids: -+ update = """ -+ map_key_t key = { -+ .cpu = cpu, -+ .pid = pid, -+ }; -+""" -+ elif args.cpus: -+ update = "u32 key = cpu;" -+ elif args.pids: -+ update = "u32 key = pid;" -+ -+ update += """ -+ map_val_t *valp = res.lookup(&key); -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ } -+ else { -+ map_val_t val = { -+ .count = 1, -+ .time = delta, -+ }; -+ res.update(&key, &val); -+ } -+""" -+bpf_text = bpf_text.replace('UPDATE', update) -+bpf_text = bpf_text.replace('FACTOR', str(factor)) -+ -+ -+# replace FILTER -+cpu_filter = "if (cpu != %s) {return 0;}" % args.cpu if args.cpu else "" -+bpf_text = bpf_text.replace('CPU_FILTER', cpu_filter) -+ -+pid_filter = "if (pid != %s) {return 0;}" % args.pid if args.pid else "" -+bpf_text = bpf_text.replace('PID_FILTER', pid_filter) -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+ -+# print header -+print("Tracing runqueue statistics", end="") -+if args.pid: -+ print(" of PID %s" % args.pid, end="") -+if args.cpu: -+ print(" on CPU %s" % args.cpu, end="") -+if args.dist: -+ print(", in diagram mode", end="") -+print("...Hit Ctrl-C to end.") -+ -+if args.dist: -+ tab = b.get_table("dist") -+else: -+ tab = b.get_table("res") -+ -+ -+def pid_to_comm(pid): -+ try: -+ return open("/proc/%d/comm" % pid, "r").read().rstrip("\n") -+ except IOError: -+ return "" -+ -+ -+def print_per_cpu_thread(tab): -+ cur_cpu = -1 -+ # k is struct map_key; v is struct map_val -+ for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].pid)): -+ if cur_cpu != k.cpu: -+ print("\nCPU%d:" % k.cpu) -+ print("%10s %40s %10s %10s" % ("PID", "Thread", "Count", "TOTAL_" + label)) -+ cur_cpu = k.cpu -+ print("%10d %40s %10d %10d" % (k.pid, pid_to_comm(k.pid), v.count, v.time / factor)) -+ -+ -+def print_per_cpu(tab): -+ print("%10s %10s %10s" % ("CPU", "Count", "TOTAL_" + label)) -+ # k is c_uint (representing CPU), v is struct map_val -+ for k, v in sorted(tab.items(), key=lambda res: res[0].value): -+ print("%10d %10d %10d" % (k.value, v.count, v.time / factor)) -+ -+ -+def print_per_thread(tab): -+ print("%10s %40s %10s %10s" % ("PID", "Thread", "Count", "TOTAL_" + label)) -+ # k is c_uint (representing PID), v is struct map_val -+ for k, v in sorted(tab.items(), key=lambda res: res[0].value): -+ print("%10d %40s %10d %10d" % (k.value, pid_to_comm(k.value), v.count, v.time / factor)) -+ -+ -+def print_dist_per_cpu_thread(tab): -+ tab.print_log2_hist("time_" + label, "section", -+ # k is (cpuN, pidN) formatted by bucket_fn -+ section_print_fn=lambda k: ("CPU %d, PID %d (%s)" % (k[0], k[1], pid_to_comm(k[1]))), -+ bucket_fn=lambda k: (k.cpu, k.pid), # k is struct map_key -+ bucket_sort_fn=lambda k: sorted(k)) # k is struct map_key -+ -+ -+def print_dist_per_cpu(tab): -+ tab.print_log2_hist("time_" + label, "CPU", -+ section_print_fn=lambda k: str(k), # k is u64 representing CPU -+ bucket_sort_fn=lambda k: sorted(k)) # k is u64 representing CPU -+ -+ -+def print_dist_per_thread(tab): -+ tab.print_log2_hist("time_" + label, "Thread", -+ section_print_fn=lambda k: ("%d (%s)" % (k, pid_to_comm(k))), # k is u64 representing PID -+ bucket_sort_fn=lambda k: sorted(k)) # k is u64 representing PID -+ -+ -+# signal handler -+def signal_ignore(signal, frame): -+ print() -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ # as cleanup can take many seconds, trap Ctrl-C: -+ signal.signal(signal.SIGINT, signal_ignore) -+ countdown = 1 -+ -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ if args.dist: -+ if args.cpus and args.pids: -+ print_dist_per_cpu_thread(tab) -+ elif args.cpus: -+ print_dist_per_cpu(tab) -+ elif args.pids: -+ print_dist_per_thread(tab) -+ else: -+ print("Invalid parameter") -+ countdown = 1 -+ else: -+ if args.cpus and args.pids: -+ print_per_cpu_thread(tab) -+ elif args.cpus: -+ print_per_cpu(tab) -+ elif args.pids: -+ print_per_thread(tab) -+ else: -+ print("Invalid parameter") -+ countdown = 1 -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alirunqinfo_example.txt b/tools/alirunqinfo_example.txt -new file mode 100644 -index 0000000..a1c24ca ---- /dev/null -+++ b/tools/alirunqinfo_example.txt -@@ -0,0 +1,203 @@ -+This program summarize runqueue statistics, that is, task on-CPU time -+before being descheduled, in flavor of per-CPU, per-PID, or histogram. -+ -+There are major two categories of parameters: -+1. one category of parameters control the sampling source, such as -+-c[--cpu] and -p[--pid] -+2. another category of parameters control the displaying format, -+such as -C[--cpus], -P[--pids] and -D[--dist] -+ -+These two categories of parameters are Orthogonal. -+ -+ -+Example 1. -+ -+By default, the program will sample all processes on all runqueues -+of all CPUs. Statistics of all processes on the same CPU is -+aggregated and displayed in a per-CPU flavor. -+ -+In this case, "alirunqinfo" is the same as "alirunqinfo -C". -+ -+# alirunqinfo -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ CPU Count TOTAL_usecs -+ 0 77 207 -+ 1 9 27 -+ 2 3 14 -+ 3 2 15 -+ 4 45 122 -+ 6 41 380 -+ 7 6 25 -+ 9 18 61 -+ 10 19 50 -+ 12 2 19 -+ 14 2 189 -+ 16 1 13 -+ 17 18 49 -+ 20 36 73 -+ 21 18 35 -+ 22 22 109 -+ 24 25 92 -+ 28 21 68 -+ 30 4 95 -+ -+ -+Example 2. -+ -+Make the output displayed in a per-process flavor with "-P" parameter. -+In this case, all processes on all CPUs are sampled as usual, while -+all statistics of the same process is aggregated, whichever CPU it -+runs on. -+ -+# alirunqinfo -P -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ PID Thread Count TOTAL_usecs -+ 10 rcu_sched 122 264 -+ 382 kworker/14:1-mm_percpu_wq 1 20 -+ 2926 rngd 1 2 -+ 3327 kworker/16:2-mm_percpu_wq 1 15 -+ 3384 in:imjournal 2 8 -+ 3428 AliYunDunUpdate 20 60 -+ 3429 AliYunDunUpdate 20 40 -+ 3442 aliyun-service 20 98 -+ 3449 AliYunDun 20 137 -+ 3450 AliYunDun 19 58 -+ 3451 AliYunDun 20 56 -+ 3462 AliYunDun 19 46 -+ 3463 AliYunDun 19 100 -+ 3464 AliYunDun 2 220 -+ 3467 AliYunDun 2 6 -+ 3468 AliYunDun 26 61 -+ 3469 AliYunDun 39 78 -+ 3470 AliYunDun 39 125 -+ 3471 AliYunDun 2 10 -+ 3473 AliYunDun 2 4 -+ 3474 AliYunDun 1 4 -+ 3475 AliYunDun 2 11 -+ 3476 AliYunDun 10 30 -+ 3533 sshd 5 193 -+ 4573 kworker/27:2-mm_percpu_wq 2 13 -+ 5016 kworker/24:0-mm_percpu_wq 1 5 -+ 5281 kworker/4:0-events_power_efficient 9 19 -+ 5999 kworker/0:2-events_power_efficient 2 34 -+ 6429 kworker/u64:0-events_unbound 3 14 -+ 6468 kworker/u64:1-events_unbound 2 37 -+ -+ -+Example 3. -+ -+You can set "-P" and "-C" parameter at the same time, in which case, the -+output is displayed in a per-CPU and per-process flavor. -+ -+# alirunqinfo -PC -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ -+CPU0: -+ PID Thread Count TOTAL_usecs -+ 10 rcu_sched 72 147 -+ 3467 AliYunDun 1 6 -+ -+CPU1: -+ PID Thread Count TOTAL_usecs -+ 3476 AliYunDun 5 26 -+ -+CPU2: -+ PID Thread Count TOTAL_usecs -+ 3384 in:imjournal 2 12 -+ -+CPU3: -+ PID Thread Count TOTAL_usecs -+ 2926 rngd 2 7 -+ -+ -+Example 4. -+ -+Sample specified process only with "-p", and "-P" is added implicitly -+when "-p" is specified. -+ -+# alirunqinfo -p 3429 -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ PID Thread Count TOTAL_usecs -+ 3429 AliYunDunUpdate 14 40 -+ -+ -+Example 5. -+ -+When both "-p" and "-C" parameters are specified, the output is displayed -+in per-CPU and per-process flavor. -+ -+# alirunqinfo -p 3429 -C -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ -+CPU22: -+ PID Thread Count TOTAL_usecs -+ 3429 AliYunDunUpdate 29 67 -+ -+ -+Example 6. -+ -+Sample processes on specified CPU only with "-c", and "-C" is added -+implicitly when "-c" is specified. -+ -+# alirunqinfo -c 22 -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ CPU Count TOTAL_usecs -+ 22 24 90 -+ -+ -+Example 7. -+ -+Make the output displayed in a diagram when "-D" is specified. -+ -+# alirunqinfo -p 3429 -D -+Tracing runqueue statistics... Hit Ctrl-C to end. -+^C -+ -+Thread = 3429 (AliYunDunUpdate) -+ time_usecs : count distribution -+ 0 -> 1 : 22 |****************************************| -+ 2 -> 3 : 12 |********************* | -+ 4 -> 7 : 0 | | -+ 8 -> 15 : 0 | | -+ 16 -> 31 : 0 | | -+ 32 -> 63 : 1 |* | -+ -+ -+ -+USAGE message: -+ -+# alirunqinfo -h -+usage: alirunqinfo.py [-h] [-T] [-N] [-C] [-P] [-D] [-c CPU] [-p PID] -+ [interval] [outputs] -+ -+Summarize runqueue statistics, including total count, time, etc. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -C, --cpus show per-CPU runqueue statistics -+ -P, --pids show per-thread runqueue statistics -+ -D, --dist show distributions as histograms -+ -c CPU, --cpu CPU show runqueue statistics on specific CPU only -+ -p PID, --pid PID show runqueue statistics of specific PID only -+ -+examples: -+ ./alirunqinfo # sum runqueue statistics -+ ./alirunqinfo -C # show per-CPU runqueue statistics -+ ./alirunqinfo -P # show per-thread runqueue statistics -+ ./alirunqinfo -D # show runqueue statistics as histograms -+ ./alirunqinfo -c 0 # show runqueue statistics of CPU 0 only -+ ./alirunqinfo -p 25 # show runqueue statistics of PID 25 only -+ ./alirunqinfo 1 10 # print 1 second summaries, 10 times -+ ./alirunqinfo -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/alisoftirqs.py b/tools/alisoftirqs.py -new file mode 100755 -index 0000000..202114f ---- /dev/null -+++ b/tools/alisoftirqs.py -@@ -0,0 +1,215 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alisoftirqs Summarize soft IRQ (interrupt) event time. -+# Enhanced version of BCC softirqs. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alisoftirqs [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] [interval] [outputs] -+# -+# Copyright (c) 2015 Brendan Gregg. -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+ -+# arguments -+examples = """examples: -+ ./alisoftirqs # sum soft irq event time -+ ./alisoftirqs -D # show soft irq event time as histograms -+ ./alisoftirqs -c 0 # show soft irq event time of CPU 0 only -+ ./alisoftirqs -i 25 # show event time of IRQ 25 only -+ ./alisoftirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only -+ ./alisoftirqs 1 10 # print 1 second summaries, 10 times -+ ./alisoftirqs -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize soft irq statistics, including total count, time, etc.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+parser.add_argument("-D", "--dist", action="store_true", -+ help="show distributions as histograms") -+parser.add_argument("-c", "--cpu", -+ help="output softIRQ statistics on specific CPU only") -+parser.add_argument("-i", "--irq", -+ help="output statistics of specific softIRQ") -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+distmode = "1" if args.dist else "0" -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct irq_key { -+ u32 cpu; -+ u32 irq; -+} irq_key_t; -+ -+typedef struct irq_key_slot { -+ irq_key_t key; -+ u64 slot; -+} irq_key_slot_t; -+ -+typedef struct irq_val { -+ u64 count; -+ u64 time; -+ u64 time_max; -+} irq_val_t; -+ -+BPF_PERCPU_ARRAY(start, u64, 1); -+BPF_HASH(res, irq_key_t, irq_val_t); -+BPF_HISTOGRAM(dist, irq_key_slot_t); -+ -+ -+TRACEPOINT_PROBE(irq, softirq_entry) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ u64 ts = bpf_ktime_get_ns(); -+ start.update(&idx, &ts); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(irq, softirq_exit) -+{ -+ u64 *tsp, delta; -+ u32 cpu = bpf_get_smp_processor_id(); -+ u32 irq = args->vec; -+ -+ u32 idx = 0; -+ // fetch timestamp and calculate delta -+ tsp = start.lookup(&idx); -+ if (!tsp || *tsp == 0) { -+ return 0; // missed start -+ } -+ -+ delta = bpf_ktime_get_ns() - *tsp; -+ irq_key_t key = {.cpu = cpu, .irq = irq}; -+ -+#if DIST_MODE -+ irq_key_slot_t key_slot = { -+ .key = key, -+ .slot = bpf_log2l(delta / FACTOR), -+ }; -+ dist.increment(key_slot); -+#else -+ struct irq_val *valp, val; -+ valp = res.lookup(&key); -+ -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ if (valp->time_max < delta) {valp->time_max = delta;} -+ } -+ else { -+ val.count = 1; -+ val.time = val.time_max = delta; -+ res.update(&key, &val); -+ } -+#endif -+ u64 zero = 0; -+ start.update(&idx, &zero); -+ -+ return 0; -+} -+""" -+ -+cond_filter = '' -+cpu_filter = "1" -+if args.cpu: -+ cpu_filter = "bpf_get_smp_processor_id() == %s" % args.cpu -+ -+irq_filter = "1" -+if args.irq: -+ irq_filter = "args->vec == %s" % args.irq -+ -+cond_filter = "if (!(%s && %s)) {return 0;}" % (cpu_filter, irq_filter) -+bpf_text = bpf_text.replace('COND_FILTER', cond_filter) -+ -+bpf_text = bpf_text.replace('FACTOR', "%d" % factor) -+bpf_text = bpf_text.replace('DIST_MODE', distmode) -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+def vec_to_name(vec): -+ # copied from softirq_to_name() in kernel/softirq.c -+ # may need updates if new softirq handlers are added -+ return ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll", -+ "tasklet", "sched", "hrtimer", "rcu"][vec] -+ -+def print_section(key): -+ return "CPU %d, softIRQ %d (%s)" % (key[0], key[1], vec_to_name(key[1])) -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing soft irq event time... Hit Ctrl-C to end.") -+ -+if args.dist: -+ tab = b.get_table("dist") -+else: -+ tab = b.get_table("res") -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ countdown = 1 -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ if args.dist: -+ tab.print_log2_hist("time_" + label, "section", -+ section_print_fn=print_section, -+ bucket_fn=lambda k: (k.cpu, k.irq), -+ bucket_sort_fn=lambda keys: sorted(keys)) -+ else: -+ cur_cpu = -1 -+ for k, v in sorted(tab.items(), key=lambda res: (res[0].cpu, res[0].irq)): -+ if cur_cpu != k.cpu: -+ print("\nCPU%d:" % k.cpu) -+ print("%10s %20s %10s %10s %10s" % -+ ("SOFTIRQ", "IRQHandler", "Count", "TOTAL_" + label, "MAX_" + label)) -+ cur_cpu = k.cpu -+ -+ print("%10d %20s %10d %10d %10d" % -+ (k.irq, vec_to_name(k.irq), v.count, v.time / factor, v.time_max / factor)) -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alisoftirqs_example.txt b/tools/alisoftirqs_example.txt -new file mode 100644 -index 0000000..48e0023 ---- /dev/null -+++ b/tools/alisoftirqs_example.txt -@@ -0,0 +1,128 @@ -+Demonstrations of alisoftirqs, the enhanced version of BCC softirqs. -+ -+This program traces soft interrupts (irqs), and stores timing statistics -+in-kernel for efficiency. -+ -+ -+Example 1 -+ -+By default, all statistics of all softirq on all CPUs are sampled, -+and the output is displayed in a per-softirq and per-CPU flavor. -+ -+# alisoftirqs -+Tracing soft irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU0: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 7 sched 1 59 59 -+ -+CPU3: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 1 0 0 -+ 3 net_rx 6 29 9 -+ 7 sched 1 1 1 -+ 9 rcu 1 0 0 -+ -+CPU4: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 29 11 1 -+ 7 sched 11 26 4 -+ 9 rcu 19 22 5 -+ -+ -+Example 2 -+ -+You can sample statistics on specified CPU only with "-c" option, -+and the output will be displayed in a per-softirq flavor. -+ -+# alisoftirqs -c 6 -+Tracing soft irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU6: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 7 2 0 -+ 7 sched 2 9 8 -+ 9 rcu 5 10 5 -+ -+ -+Exampel 3 -+ -+You can sample statistics of specified softirq only with "-i" option, -+and the output will be displayed in a per-CPU flavor. -+ -+# alisoftirqs -i 1 -+Tracing soft irq event time... Hit Ctrl-C to end. -+^C -+ -+CPU0: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 1 2 2 -+ -+CPU4: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 26 11 2 -+ -+CPU7: -+ SOFTIRQ IRQHandler Count TOTAL_usecs MAX_usecs -+ 1 timer 2 5 4 -+ -+ -+ -+Example 4 -+ -+The distribution of softirq run time can be printed as a histogram with "-D" -+option. The output will be displayed in a per-softirq and per-CPU flavor. -+ -+# alisoftirqs -D -+Tracing soft irq event time... Hit Ctrl-C to end. -+^C -+ -+section = CPU 3, softIRQ 3 (net_rx) -+ time_usecs : count distribution -+ 0 -> 1 : 0 | | -+ 2 -> 3 : 2 |****************************************| -+ 4 -> 7 : 0 | | -+ 8 -> 15 : 1 |******************** | -+ -+section = CPU 4, softIRQ 1 (timer) -+ time_usecs : count distribution -+ 0 -> 1 : 13 |****************************************| -+ -+section = CPU 4, softIRQ 7 (sched) -+ time_usecs : count distribution -+ 0 -> 1 : 2 |****************************************| -+ 2 -> 3 : 1 |******************** | -+ 4 -> 7 : 2 |****************************************| -+ -+ -+ -+USAGE message: -+ -+# alisoftirqs -h -+usage: alisoftirqs.py [-h] [-T] [-N] [-D] [-c CPU] [-i IRQ] -+ [interval] [outputs] -+ -+Summarize soft irq statistics, including total count, time, etc. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -D, --dist show distributions as histograms -+ -c CPU, --cpu CPU output softIRQ statistics on specific CPU only -+ -i IRQ, --irq IRQ output statistics of specific softIRQ -+ -+examples: -+ ./alisoftirqs # sum soft irq event time -+ ./alisoftirqs -D # show soft irq event time as histograms -+ ./alisoftirqs -c 0 # show soft irq event time of CPU 0 only -+ ./alisoftirqs -i 25 # show event time of IRQ 25 only -+ ./alisoftirqs -c 0 -i 25 # show event time of IRQ 25 on CPU 0 only -+ ./alisoftirqs 1 10 # print 1 second summaries, 10 times -+ ./alisoftirqs -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/alisyscount.py b/tools/alisyscount.py -new file mode 100755 -index 0000000..921fac7 ---- /dev/null -+++ b/tools/alisyscount.py -@@ -0,0 +1,329 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alisyscount Summarize syscall counts and latencies. -+# Enhanced version of BCC syscount. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alisyscount [-h] [-T] [-N] [-D] [-S] [-P] [-s syscall] [-p pid] [interval] [outputs] -+# -+# Copyright (c) 2015 Brendan Gregg. -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+import ctypes -+from bcc.syscall import syscall_name, syscalls -+ -+# arguments -+examples = """examples: -+ ./alisyscount # sum syscall counts and latencies. -+ ./alisyscount -S # show per-syscall statistics -+ ./alisyscount -P # show per-thread syscall statistics -+ ./alisyscount -D # show syscall statistics as histograms -+ ./alisyscount -s 0 # show syscall statistics of syscall 0 only -+ ./alisyscount -p 25 # show syscall statistics of PID 25 only -+ ./alisyscount 1 10 # print 1 second summaries, 10 times -+ ./alisyscount -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+parser = argparse.ArgumentParser( -+ description="Summarize syscall counts and latencies.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+parser.add_argument("-S", "--syscalls", action="store_true", -+ help="show per-syscall statistics") -+parser.add_argument("-P", "--pids", action="store_true", -+ help="show per-thread syscall statistics") -+parser.add_argument("-D", "--dist", action="store_true", -+ help="show distributions as histograms") -+parser.add_argument("-s", "--syscall", -+ help="show statistics of specific syscall only") -+parser.add_argument("-p", "--pid", -+ help="show syscall statistics of specific PID only") -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+if args.pid: -+ args.pids = True -+if args.syscall: -+ args.syscalls = True -+if (not args.syscalls) and (not args.pids): -+ args.syscalls = True -+ -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+BPF_PERCPU_ARRAY(start, u64, 1); -+ -+STORAGE -+ -+TRACEPOINT_PROBE(raw_syscalls, sys_enter) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ u64 ts = bpf_ktime_get_ns(); -+ start.update(&idx, &ts); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(raw_syscalls, sys_exit) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ u64 *tsp, delta; -+ tsp = start.lookup(&idx); -+ if (!tsp || *tsp == 0) { -+ return 0; // missed start -+ } -+ -+ delta = bpf_ktime_get_ns() - *tsp; -+ u64 zero = 0; -+ start.update(&idx, &zero); -+ -+ u32 sysnr = args->id; -+ u32 pid = bpf_get_current_pid_tgid(); -+ -+ UPDATE -+ -+ return 0; -+} -+""" -+ -+ -+# replace STORAGE -+map_key = """ -+typedef struct map_key { -+ u32 sysnr; -+ u32 pid; -+} map_key_t; -+""" -+ -+map_val = """ -+typedef struct map_val { -+ u64 time; -+ u64 count; -+} map_val_t; -+""" -+ -+if args.dist: -+ if args.syscalls and args.pids: -+ storage = map_key + """ -+typedef struct map_key_slot { -+ map_key_t key; -+ u64 slot; -+} map_key_slot_t; -+""" -+ else: -+ storage = """ -+typedef struct map_key_slot { -+ u64 key; -+ u64 slot; -+} map_key_slot_t; -+""" -+ storage += "BPF_HISTOGRAM(dist, map_key_slot_t);" -+ -+else: -+ storage = map_val -+ if args.syscalls and args.pids: -+ storage += map_key + "BPF_HASH(res, map_key_t, map_val_t);" -+ else: -+ storage += "BPF_HASH(res, u32, map_val_t);" -+bpf_text = bpf_text.replace('STORAGE', storage) -+ -+ -+# replace UPDATE -+if args.dist: -+ if args.syscalls and args.pids: -+ key_exp = "{.sysnr = sysnr, .pid = pid,}" -+ elif args.syscalls: -+ key_exp = "sysnr" -+ elif args.pids: -+ key_exp = "pid" -+ -+ update = """ -+ map_key_slot_t key_slot = { -+ .key = %s, -+ .slot = bpf_log2l(delta / FACTOR), -+ }; -+ dist.increment(key_slot); -+""" % key_exp -+ -+else: -+ if args.syscalls and args.pids: -+ update = """ -+ map_key_t key = { -+ .sysnr = sysnr, -+ .pid = pid, -+ }; -+""" -+ elif args.syscalls: -+ update = "u32 key = sysnr;" -+ elif args.pids: -+ update = "u32 key = pid;" -+ -+ update += """ -+ map_val_t val = {}; -+ map_val_t *valp = res.lookup_or_init(&key, &val); -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ } -+""" -+bpf_text = bpf_text.replace('UPDATE', update) -+bpf_text = bpf_text.replace('FACTOR', str(factor)) -+ -+ -+# replace FILTER -+sysnr_match = "args->id == %s" % args.syscall -+pid_match = "((u32)bpf_get_current_pid_tgid()) == %s" % args.pid -+ -+if args.syscall and args.pid: -+ cond_match = "(%s) && (%s)" % (sysnr_match, pid_match) -+elif args.syscall: -+ cond_match = sysnr_match -+elif args.pid: -+ cond_match = pid_match -+else: -+ cond_match = "" -+ -+cond_filter = "if (!(%s)) {return 0;}" % cond_match if cond_match else "" -+bpf_text = bpf_text.replace('COND_FILTER', cond_filter) -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing syscall statistics... Hit Ctrl-C to end.") -+ -+if args.dist: -+ tab = b.get_table("dist") -+else: -+ tab = b.get_table("res") -+ -+ -+def pid_to_comm(pid): -+ try: -+ return open("/proc/%d/comm" % pid, "r").read().rstrip("\n") -+ except IOError: -+ return "" -+ -+ -+def get_pid_info(pid): -+ return "%10d %30s" % (pid, pid_to_comm(pid)) -+ -+def get_syscall_info(sysnr): -+ return "%10d %30s" % (sysnr, syscall_name(sysnr)) -+ -+def skip_errata(k): -+ if args.syscalls and args.pids: -+ sysnr = k.sysnr -+ else: # args.syscalls or args.pids -+ sysnr = k.value -+ -+ # happens occasionally, we don't need it -+ if sysnr == 0xFFFFFFFF: -+ return True -+ else: -+ return False -+ -+ -+def print_stat(tab): -+ if args.syscalls and args.pids: -+ section_hdr = "%10s %30s %10s %30s" % ("SysNR", "Syscall", "PID", "Thread") -+ elif args.pids: -+ section_hdr = "%10s %30s" % ("PID", "Thread") -+ else: # args.syscalls -+ section_hdr = "%10s %30s" % ("SysNR", "Syscall") -+ -+ section_hdr += " %10s %10s" % ("Count", "TOTAL_" + label) -+ print(section_hdr) -+ -+ # k is struct map_key; v is struct map_val -+ for k, v in sorted(tab.items(), key=lambda res: (-res[1].count, -res[1].time)): -+ if skip_errata(k): -+ continue -+ -+ if args.syscalls and args.pids: -+ record = "%40s %40s" % (get_syscall_info(k.sysnr), get_pid_info(k.pid)) -+ elif args.pids: -+ record = "%40s" % (get_pid_info(k.value)) -+ else: # args.syscalls -+ record = "%40s" % (get_syscall_info(k.value)) -+ -+ record += "%10d %10d" % (v.count, v.time / factor) -+ print(record) -+ -+ -+def get_dist_pid_info(pid): -+ return "PID %d (%s)" % (pid, pid_to_comm(pid)) -+ -+def get_dist_syscall_info(sysnr): -+ return "Syscall %d (%s)" % (sysnr, syscall_name(sysnr)) -+ -+def print_dist_section(k): -+ if args.syscalls and args.pids: -+ return "%s %s" % (get_dist_syscall_info(k[0]), get_dist_pid_info(k[1])) -+ elif args.pids: -+ return "%40s" % (get_dist_pid_info(k)) -+ else: -+ return "%40s" % (get_dist_syscall_info(k)) -+ -+def print_dist_stat(tab): -+ section = "section" if args.syscalls and args.pids else "Thread" if args.pids else "Syscall" -+ tab.print_log2_hist("time_" + label, section, -+ section_print_fn=print_dist_section, -+ bucket_fn=(lambda k: (k.sysnr, k.pid)) if args.syscalls and args.pids else None, -+ bucket_sort_fn=lambda k: sorted(k)) -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ countdown = 1 -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ if args.dist: -+ print_dist_stat(tab) -+ else: -+ print_stat(tab) -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alisyscount_example.txt b/tools/alisyscount_example.txt -new file mode 100644 -index 0000000..935f2ff ---- /dev/null -+++ b/tools/alisyscount_example.txt -@@ -0,0 +1,199 @@ -+This program summarize syscall counts and latencies, in flavor of -+per-CPU, per-PID, or histogram. -+ -+There are major two categories of parameters: -+1. one category of parameters control the sampling source, such as -+-s[--syscall] and -p[--pid] -+2. another category of parameters control the displaying format, -+such as -S[--syscalls], -P[--pids] and -D[--dist] -+ -+These two categories of parameters are Orthogonal. -+ -+ -+Example 1. -+ -+By default, the program will sample all syscalls of all processes. -+Statistics of all processes on the same syscall is aggregated and -+displayed in a per-syscall flavor. -+ -+In this case, "alisyscount" is the same as "alisyscount -S". -+ -+# alisyscount -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ SysNR Syscall Count TOTAL_usecs -+ 35 nanosleep 246 23815620 -+ 4 stat 207 306 -+ 0 read 61 65 -+ 321 bpf 43 112 -+ 3 close 28 23 -+ 14 rt_sigprocmask 20 5 -+ 232 epoll_wait 20 5 -+ 162 sync 19 1901216 -+ 228 clock_gettime 18 5 -+ 23 select 9 4016487 -+ 6 lstat 9 16 -+ 1 write 8 136 -+ 2 open 8 19 -+ 72 fcntl 6 1 -+ 5 fstat 4 2 -+ 16 ioctl 3 3 -+ 271 ppoll 1 900954 -+ 78 getdents 1 355 -+ 89 readlink 1 4 -+ 257 openat 1 2 -+ 9 mmap 1 2 -+ 100 times 1 2 -+ 13 rt_sigaction 1 0 -+ -+ -+Example 2. -+ -+Make the output displayed in a per-process flavor with "-P" parameter. -+In this case, all syscalls of all processe are sampled as usual, while -+all statistics of the same process is aggregated, whatever syscall it -+runs. -+ -+# alisyscount -P -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ PID Thread Count TOTAL_usecs -+ 3550 AliYunDun 125 3103841 -+ 3529 AliYunDun 71 3101981 -+ 3549 AliYunDun 62 3103861 -+ 3543 AliYunDun 61 3001874 -+ 3493 aliyun-service 61 3001874 -+ 3564 sshd 48 3149751 -+ 3824 alisyscount.py 46 3149398 -+ 3548 AliYunDun 40 3002530 -+ 3544 AliYunDun 32 2000233 -+ 3530 AliYunDun 31 3101971 -+ 3556 AliYunDun 31 3001000 -+ 3542 AliYunDun 30 3001918 -+ 3531 AliYunDun 30 3001871 -+ 3423 AliYunDunUpdate 30 3001867 -+ 3424 AliYunDunUpdate 30 3001853 -+ 3475 in:imjournal 5 1801873 -+ 3551 AliYunDun 3 1000111 -+ 3422 AliYunDunUpdate 3 1 -+ 3553 AliYunDun 2 2000172 -+ 3547 AliYunDun 2 2000172 -+ 3555 AliYunDun 2 2000162 -+ -+ -+Example 3. -+ -+You can set "-P" and "-S" parameter at the same time, in which case, the -+output is displayed in a per-syscall and per-process flavor. -+ -+# alisyscount -P -S -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ SysNR Syscall PID Thread Count TOTAL_usecs -+ 4 stat 3550 AliYunDun 586 855 -+ 35 nanosleep 3549 AliYunDun 40 2002500 -+ 0 read 3550 AliYunDun 36 90 -+ 35 nanosleep 3548 AliYunDun 26 1951607 -+ 321 bpf 3832 alisyscount.py 23 36 -+ 35 nanosleep 3550 AliYunDun 22 1901390 -+ 35 nanosleep 3530 -+ -+ -+Example 4. -+ -+Sample specified process only with "-p", and "-P" is added implicitly -+when "-p" is specified. -+ -+# alisyscount -p 3550 -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ PID Thread Count TOTAL_usecs -+ 3550 AliYunDun 121 3003773 -+ -+ -+Example 5. -+ -+When both "-p" and "-S" parameters are specified, the output is displayed -+in per-syscall and per-process flavor. -+ -+# alisyscount -p 3550 -S -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ SysNR Syscall PID Thread Count TOTAL_usecs -+ 0 read 3550 AliYunDun 42 16 -+ 35 nanosleep 3550 AliYunDun 41 2052562 -+ -+ -+Example 6. -+ -+Sample specified syscall only with "-s", and "-S" is added -+implicitly when "-s" is specified. -+ -+# alisyscount -s 3 -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ SysNR Syscall Count TOTAL_usecs -+ 3 close 36 15 -+ -+ -+Example 7. -+ -+Make the output displayed in a diagram when "-D" is specified. -+ -+# alisyscount -p 3550 -D -+Tracing syscall statistics... Hit Ctrl-C to end. -+^C -+ -+Thread = PID 3550 (AliYunDun) -+ time_usecs : count distribution -+ 0 -> 1 : 33 |****************************************| -+ 2 -> 3 : 0 | | -+ 4 -> 7 : 0 | | -+ 8 -> 15 : 0 | | -+ 16 -> 31 : 0 | | -+ 32 -> 63 : 0 | | -+ 64 -> 127 : 0 | | -+ 128 -> 255 : 0 | | -+ 256 -> 511 : 0 | | -+ 512 -> 1023 : 0 | | -+ 1024 -> 2047 : 0 | | -+ 2048 -> 4095 : 0 | | -+ 4096 -> 8191 : 0 | | -+ 8192 -> 16383 : 0 | | -+ 16384 -> 32767 : 0 | | -+ 32768 -> 65535 : 32 |************************************** | -+ -+ -+ -+USAGE message: -+ -+# alisyscount -h -+usage: alisyscount.py [-h] [-T] [-N] [-S] [-P] [-D] [-s SYSCALL] [-p PID] -+ [interval] [outputs] -+ -+Summarize syscall counts and latencies. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -S, --syscalls show per-syscall statistics -+ -P, --pids show per-thread syscall statistics -+ -D, --dist show distributions as histograms -+ -s SYSCALL, --syscall SYSCALL -+ show statistics of specific syscall only -+ -p PID, --pid PID show syscall statistics of specific PID only -+ -+examples: -+ ./alisyscount # sum syscall counts and latencies. -+ ./alisyscount -S # show per-syscall statistics -+ ./alisyscount -P # show per-thread syscall statistics -+ ./alisyscount -D # show syscall statistics as histograms -+ ./alisyscount -s 0 # show syscall statistics of syscall 0 only -+ ./alisyscount -p 25 # show syscall statistics of PID 25 only -+ ./alisyscount 1 10 # print 1 second summaries, 10 times -+ ./alisyscount -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/alisysdelay.py b/tools/alisysdelay.py -new file mode 100755 -index 0000000..e4c7952 ---- /dev/null -+++ b/tools/alisysdelay.py -@@ -0,0 +1,297 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alisysdelay Catch tasks hung in kernel mode and get its stack -+# For Linux, uses BCC, BPF, perf_events. Embedded C. -+# -+# In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes -+# may be unacceptable, because some tasks may run too long in kernel mode -+# without calling cond_sched(). These tasks may be kernel thread, or user -+# processes that enter kernel mode by syscall. -+# -+# alisysdelay can catch tasks that have been running continuously for a long time -+# whithout scheduling, and kernel/user stackes of these tasks will be printed. -+# The threshold can be user defined. -+# -+# By default CPU idle stacks are excluded by simply excluding PID 0. -+# -+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). -+# -+# USAGE: alisysdelay [-h] [-d] [-a] [-f frequency] [-t threshold] [--stack-storage-size size] -+# -+# Copyright (c) 2016 Brendan Gregg, Netflix, Inc. -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF, PerfType, PerfSWConfig -+import argparse -+import errno -+import ctypes as ct -+ -+ -+# arg validation -+def positive_int(val): -+ try: -+ ival = int(val) -+ except ValueError: -+ raise argparse.ArgumentTypeError("must be an integer") -+ -+ if ival < 0: -+ raise argparse.ArgumentTypeError("must be positive") -+ return ival -+ -+def positive_nonzero_int(val): -+ ival = positive_int(val) -+ if ival == 0: -+ raise argparse.ArgumentTypeError("must be nonzero") -+ return ival -+ -+def stack_id_err(stack_id): -+ # -EFAULT in get_stackid normally means the stack-trace is not availible, -+ # Such as getting kernel stack trace in userspace code -+ return (stack_id < 0) and (stack_id != -errno.EFAULT) -+ -+# arguments -+examples = """examples: -+ ./alisysdelay # catch tasks hung in kernel mode -+ ./alisysdelay -f 99 # catch tasks hung in kernel mode at 99 Hertz -+ ./alisysdelay -t 100 # catch tasks that hung in kernel mode for more than 100us -+""" -+parser = argparse.ArgumentParser( -+ description="Catch tasks hung in kernel mode and get its stack", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-f", "--frequency", type=positive_nonzero_int, default=100, -+ help="sample frequency in Hertz " -+ "(default %(default)s)") -+parser.add_argument("-d", "--delimited", action="store_true", -+ help="insert delimiter between kernel/user stacks") -+parser.add_argument("-a", "--annotations", action="store_true", -+ help="add _[k] annotations to kernel frames") -+parser.add_argument("-t", "--threshold", type=positive_nonzero_int, default=50, -+ help="set the threshold of delay in kernel mode, in us " -+ "(default %(default)s)") -+parser.add_argument("--stack-storage-size", type=positive_nonzero_int, default=16384, -+ help="the number of unique stack traces that can be stored and displayed " -+ "(default %(default)s)") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+# option logic -+args = parser.parse_args() -+debug = 0 -+ -+ -+# define BPF program -+bpf_text = """ -+#include -+#include -+#include -+#include -+#include -+ -+typedef struct event { -+ u32 pid; -+ u64 kernel_ip; -+ int user_stack_id; -+ int kernel_stack_id; -+ char comm[TASK_COMM_LEN]; -+} event_t; -+ -+BPF_PERCPU_ARRAY(start, u64, 1); -+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE); -+BPF_PERF_OUTPUT(events); -+ -+static inline void update_start_time() -+{ -+ u32 idx = 0; -+ u64 ts = bpf_ktime_get_ns(); -+ start.update(&idx, &ts); -+} -+ -+TRACEPOINT_PROBE(raw_syscalls, sys_enter) -+{ -+ update_start_time(); -+ return 0; -+} -+ -+TRACEPOINT_PROBE(sched, sched_switch) -+{ -+ update_start_time(); -+ return 0; -+} -+ -+int kprobe___cond_resched(struct pt_regs *ctx) -+{ -+ update_start_time(); -+ return 0; -+} -+ -+// This is copy from arch/x86/asm/ptrace.h -+static inline int user_mode_dump(struct pt_regs *regs) -+{ -+#ifdef CONFIG_X86_32 -+ u64 cs, flags; -+ bpf_probe_read(&cs, sizeof(u64), ®s->cs); -+ bpf_probe_read(&flags, sizeof(u64), ®s->flags); -+ return ((cs & SEGMENT_RPL_MASK) | (flags & X86_VM_MASK)) >= USER_RPL; -+#else -+ u64 cs; -+ bpf_probe_read(&cs, sizeof(u64), ®s->cs); -+ return !!( cs & 3); -+#endif -+} -+ -+static bool need_check_delay(struct pt_regs *regs) -+{ -+ // ignore process 0 -+ if (bpf_get_current_pid_tgid() == 0) { -+ return 0; -+ } -+ -+ // skip when currently cpu is in user mode -+ if (user_mode_dump(regs)) { -+ return 0; -+ } -+ -+ return 1; -+} -+ -+static inline void get_stack_id(struct bpf_perf_event_data *ctx) -+{ -+ event_t key = {.pid = bpf_get_current_pid_tgid()}; -+ bpf_get_current_comm(&key.comm, sizeof(key.comm)); -+ -+ // get stacks -+ key.user_stack_id = stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK); -+ key.kernel_stack_id = stack_traces.get_stackid(&ctx->regs, 0); -+ -+ if (key.kernel_stack_id >= 0) { -+ // populate extras to fix the kernel stack -+ u64 ip = PT_REGS_IP(&ctx->regs); -+ u64 page_offset; -+ -+ // if ip isn't sane, leave key ips as zero for later checking -+#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE) -+ // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it -+ page_offset = __PAGE_OFFSET_BASE; -+#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4) -+ // x64, 4.17, and later -+#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL) -+ page_offset = __PAGE_OFFSET_BASE_L5; -+#else -+ page_offset = __PAGE_OFFSET_BASE_L4; -+#endif -+#else -+ // earlier x86_64 kernels, e.g., 4.6, comes here -+ // arm64, s390, powerpc, x86_32 -+ page_offset = PAGE_OFFSET; -+#endif -+ -+ if (ip > page_offset) { -+ key.kernel_ip = ip; -+ } -+ } -+ -+ events.perf_submit(ctx, &key, sizeof(event_t)); -+} -+ -+int sys_timer(struct bpf_perf_event_data *ctx) { -+ -+ if (need_check_delay(&ctx->regs)) { -+ u32 idx = 0; -+ u64 *tsp = start.lookup(&idx); -+ if (!tsp || *tsp == 0) { -+ return 0; // missed start -+ } -+ -+ if ((bpf_ktime_get_ns() - *tsp) >= (DELAY_THRESHOLD)) { -+ get_stack_id(ctx); -+ } -+ } -+ else { -+ update_start_time(); -+ } -+ -+ return 0; -+} -+""" -+ -+# set stack storage size -+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) -+bpf_text = bpf_text.replace('DELAY_THRESHOLD', str(args.threshold * 1000)) -+ -+ -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+# initialize BPF & perf_events -+b = BPF(text=bpf_text) -+b.attach_perf_event(ev_type=PerfType.SOFTWARE, -+ ev_config=PerfSWConfig.CPU_CLOCK, fn_name="sys_timer", -+ sample_freq=args.frequency, pid=-1, cpu=-1, group_fd=-1) -+ -+ -+def aksym(addr): -+ if args.annotations: -+ return b.ksym(addr) + "_[k]".encode() -+ else: -+ return b.ksym(addr) -+ -+TASK_COMM_LEN = 16 -+class Record(ct.Structure): -+ _fields_ = [("pid", ct.c_ulong), -+ ("kernel_ip", ct.c_ulonglong), -+ ("user_stack_id", ct.c_int), -+ ("kernel_stack_id", ct.c_int), -+ ("comm", ct.c_char * TASK_COMM_LEN)] -+ -+ -+def print_stack(cpu, data, size): -+ k = ct.cast(data, ct.POINTER(Record)).contents -+ -+ user_stack = [] if k.user_stack_id < 0 else stack_traces.walk(k.user_stack_id) -+ kernel_tmp = [] if k.kernel_stack_id < 0 else stack_traces.walk(k.kernel_stack_id) -+ -+ # fix kernel stack -+ kernel_stack = [] -+ if k.kernel_stack_id >= 0: -+ for addr in kernel_tmp: -+ kernel_stack.append(addr) -+ # the later IP checking -+ if k.kernel_ip: -+ kernel_stack.insert(0, k.kernel_ip) -+ -+ # print default multi-line stack output -+ if stack_id_err(k.kernel_stack_id): -+ print(" [Missed Kernel Stack]") -+ else: -+ for addr in kernel_stack: -+ print(" %s" % aksym(addr)) -+ if args.delimited and k.user_stack_id >= 0 and k.kernel_stack_id >= 0: -+ print(" --") -+ if stack_id_err(k.user_stack_id): -+ print(" [Missed User Stack]") -+ else: -+ for addr in user_stack: -+ print(" %s" % b.sym(addr, k.pid).decode('utf-8', 'replace')) -+ print(" %-16s %s (%d)" % ("-", k.comm.decode('utf-8', 'replace'), k.pid)) -+ print() -+ -+ -+# header -+print("Sampling at %s Hz frequency" % args.frequency, end="") -+print("... Hit Ctrl-C to end.") -+ -+# collect samples -+b["events"].open_perf_buffer(print_stack) -+stack_traces = b["stack_traces"] -+ -+while (1): -+ try: -+ b.perf_buffer_poll() -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/alisysdelay_example.txt b/tools/alisysdelay_example.txt -new file mode 100644 -index 0000000..6f8f593 ---- /dev/null -+++ b/tools/alisysdelay_example.txt -@@ -0,0 +1,75 @@ -+alisysdelay is used to catch tasks hung in kernel mode and get its stack. -+ -+In kernel without CONFIG_PREEMPT enabled, scheduling latency sometimes -+may be unacceptable, because some tasks may run too long in kernel mode -+without calling cond_sched(). These tasks may be kernel thread, or user -+processes that enter kernel mode by syscall. -+ -+alisysdelay can catch tasks that have been running continuously for a long time -+whithout scheduling, and kernel/user stackes will be printed for these tasks. -+The threshold can be user defined. -+ -+By default CPU idle stacks are excluded by simply excluding PID 0. -+ -+ -+ -+For example, -+ -+# alisysdelay -t 10 -+Sampling at 100 Hz frequency... Hit Ctrl-C to end. -+ -+ iowrite16 -+ iowrite16 -+ vp_notify -+ virtqueue_notify -+ start_xmit -+ dev_hard_start_xmit -+ sch_direct_xmit -+ __qdisc_run -+ __dev_queue_xmit -+ ip_finish_output2 -+ ip_output -+ __tcp_transmit_skb -+ tcp_write_xmit -+ __tcp_push_pending_frames -+ tcp_sendmsg_locked -+ tcp_sendmsg -+ sock_sendmsg -+ sock_write_iter -+ __vfs_write -+ vfs_write -+ ksys_write -+ do_syscall_64 -+ entry_SYSCALL_64_after_hwframe -+ __write_nocancel -+ - sshd (4617) -+ -+In this case, sshd (PID 4617) is catched for staying in kernel mode -+for more than 10us, without rescheduling. -+ -+ -+ -+USAGE message: -+# alisysdelay -h -+usage: alisysdelay.py [-h] [-f FREQUENCY] [-d] [-a] [-t THRESHOLD] -+ [--stack-storage-size STACK_STORAGE_SIZE] -+ -+Catch tasks hung in kernel mode and get its stack -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -f FREQUENCY, --frequency FREQUENCY -+ sample frequency in Hertz (default 100) -+ -d, --delimited insert delimiter between kernel/user stacks -+ -a, --annotations add _[k] annotations to kernel frames -+ -t THRESHOLD, --threshold THRESHOLD -+ set the threshold of delay in kernel mode, in us -+ (default 50) -+ --stack-storage-size STACK_STORAGE_SIZE -+ the number of unique stack traces that can be stored -+ and displayed (default 16384) -+ -+examples: -+ ./alisysdelay # catch tasks hung in kernel mode -+ ./alisysdelay -f 99 # catch tasks hung in kernel mode at 99 Hertz -+ ./alisysdelay -t 100 # catch tasks that hung in kernel mode for more than 100us -diff --git a/tools/alisyslatency.py b/tools/alisyslatency.py -new file mode 100755 -index 0000000..05bfdbd ---- /dev/null -+++ b/tools/alisyslatency.py -@@ -0,0 +1,171 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# alisyslatency Summarize time spent in kernel context. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: alisyslatency [-h] [-T] [-N] [-c CPU] [-p PID] [interval] [outputs] -+# -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+from time import sleep, strftime -+import argparse -+ -+# arguments -+examples = """examples: -+ ./alisyslatency # sum time in kernel context per CPU -+ ./alisyslatency -c 0 # show time in kernel context in CPU 0 only -+ ./alisyslatency -p 25 # show time in kernel context in pid 25 only -+ ./alisyslatency 1 10 # print 1 second summaries, 10 times -+ ./alisyslatency -NT 1 # 1s summaries, nanoseconds, and timestamps -+""" -+ -+parser = argparse.ArgumentParser( -+ description="Summarize time spent in kernel context.", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("-T", "--timestamp", action="store_true", -+ help="include timestamp on output") -+parser.add_argument("-N", "--nanoseconds", action="store_true", -+ help="output in nanoseconds") -+ -+thread_group = parser.add_mutually_exclusive_group() -+thread_group.add_argument("-c", "--cpu", -+ help="output statistics on specific CPU only") -+thread_group.add_argument("-p", "--pid", -+ help="output statistics of specific pid only") -+ -+parser.add_argument("interval", nargs="?", default=99999999, -+ help="output interval, in seconds") -+parser.add_argument("outputs", nargs="?", default=99999999, -+ help="number of outputs") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+countdown = int(args.outputs) -+if args.nanoseconds: -+ factor = 1 -+ label = "nsecs" -+else: -+ factor = 1000 -+ label = "usecs" -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct map_val { -+ u64 count; -+ u64 time; -+ u64 time_max; -+} map_val_t; -+ -+BPF_PERCPU_ARRAY(start, u64, 1); -+BPF_HASH(res, u32, map_val_t); -+ -+ -+TRACEPOINT_PROBE(context_tracking, user_exit) -+{ -+ COND_FILTER -+ -+ u32 idx = 0; -+ u64 ts = bpf_ktime_get_ns(); -+ start.update(&idx, &ts); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(context_tracking, user_enter) -+{ -+ COND_FILTER -+ -+ u64 *tsp, delta; -+ u32 idx = 0; -+ // fetch timestamp and calculate delta -+ tsp = start.lookup(&idx); -+ if (!tsp || *tsp == 0) { -+ return 0; // missed start -+ } -+ -+#if STATISTICS_PER_PROCESS -+ u32 key = bpf_get_current_pid_tgid(); -+#else -+ u32 key = bpf_get_smp_processor_id(); -+#endif -+ -+ delta = bpf_ktime_get_ns() - *tsp; -+ map_val_t *valp, val; -+ -+ valp = res.lookup(&key); -+ if (valp) { -+ valp->count += 1; -+ valp->time += delta; -+ if (valp->time_max < delta) {valp->time_max = delta;} -+ } -+ else { -+ val.count = 1; -+ val.time = val.time_max = delta; -+ res.update(&key, &val); -+ } -+ -+ u64 zero = 0; -+ start.update(&idx, &zero); -+ -+ return 0; -+} -+""" -+ -+cond_exp = "1" -+if args.cpu: -+ cond_exp = "bpf_get_smp_processor_id() == %s" % args.cpu -+ -+if args.pid: -+ cond_exp = "(u32)bpf_get_current_pid_tgid() == %s" % args.pid -+ -+cond_filter = "if (!(%s)) {return 0;}" % cond_exp -+bpf_text = bpf_text.replace('COND_FILTER', cond_filter) -+ -+bpf_text = bpf_text.replace('STATISTICS_PER_PROCESS', "1" if args.cpu else "0") -+ -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing time in kernel context... Hit Ctrl-C to end.") -+tab = b.get_table("res") -+ -+ -+# output -+while (1): -+ try: -+ sleep(int(args.interval)) -+ except KeyboardInterrupt: -+ countdown = 1 -+ print() -+ if args.timestamp: -+ print("%-8s\n" % strftime("%H:%M:%S"), end="") -+ -+ # print header -+ print("%10s %10s %10s %10s" % -+ ("PID" if args.cpu else "CPU", "Count", "TOTAL_" + label, "MAX_" + label)) -+ -+ for k, v in sorted(tab.items(), key=lambda res: res[0]): -+ print("%10d %10d %10d %10d" % -+ (k, v.count, v.time / factor, v.time_max / factor)) -+ -+ tab.clear() -+ -+ countdown -= 1 -+ if countdown == 0: -+ exit() -diff --git a/tools/alisyslatency_example.txt b/tools/alisyslatency_example.txt -new file mode 100644 -index 0000000..f5dd2c7 ---- /dev/null -+++ b/tools/alisyslatency_example.txt -@@ -0,0 +1,72 @@ -+Demonstrations of alisyslatency, the Linux eBPF/bcc version. -+ -+ -+This program traces time spent in kernel context (between -+context_tracking:user_exit and context_tracking:user_enter), -+and stores timing statistics in-kernel for efficiency. -+ -+ -+**Note** This tool needs Context tracking subsystem, which can not -+work without support of full dynticks. -+ -+ -+Example 1 -+ -+By default, all statistics of all processes on all CPUs are sampled, -+and the output is displayed in a per-CPU flavor. -+ -+# alisyslatency -+Tracing time in kernel context... Hit Ctrl-C to end. -+^C -+ CPU Count TOTAL_usecs MAX_usecs -+ -+ -+Example 2 -+ -+You can sample statistics on specified CPU only with "-c" option, -+and the output will be displayed in a per-process flavor. -+ -+# alisyslatency -c 6 -+Tracing time in kernel context... Hit Ctrl-C to end. -+^C -+ PID Count TOTAL_usecs MAX_usecs -+ -+ -+ -+Exampel 3 -+ -+You can sample statistics of specified process only with "-p" option, -+and the output will be displayed in a per-CPU flavor. -+ -+ -+# alisyslatency -p 3550 -+Tracing time in kernel context... Hit Ctrl-C to end. -+^C -+ CPU Count TOTAL_usecs MAX_usecs -+ -+ -+ -+USAGE message: -+ -+# alisyslatency -h -+usage: alisyslatency.py [-h] [-T] [-N] [-c CPU | -p PID] [interval] [outputs] -+ -+Summarize time spent in kernel context. -+ -+positional arguments: -+ interval output interval, in seconds -+ outputs number of outputs -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -T, --timestamp include timestamp on output -+ -N, --nanoseconds output in nanoseconds -+ -c CPU, --cpu CPU output statistics on specific CPU only -+ -p PID, --pid PID output statistics of specific pid only -+ -+examples: -+ ./alisyslatency # sum time in kernel context per CPU -+ ./alisyslatency -c 0 # show time in kernel context in CPU 0 only -+ ./alisyslatency -p 25 # show time in kernel context in pid 25 only -+ ./alisyslatency 1 10 # print 1 second summaries, 10 times -+ ./alisyslatency -NT 1 # 1s summaries, nanoseconds, and timestamps -diff --git a/tools/aliworkslower.py b/tools/aliworkslower.py -new file mode 100755 -index 0000000..6c7fffe ---- /dev/null -+++ b/tools/aliworkslower.py -@@ -0,0 +1,137 @@ -+#!/usr/bin/python -+# @lint-avoid-python-3-compatibility-imports -+# -+# aliworkslower Trace high work sched latency. -+# For Linux, uses BCC, eBPF. -+# -+# USAGE: aliworkslower [-h] [threshold] -+# -+# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc. -+# Licensed under the Apache License, Version 2.0 (the "License") -+ -+from __future__ import print_function -+from bcc import BPF -+import argparse -+import ctypes as ct -+ -+# arguments -+examples = """examples: -+ ./aliworkslower # trace work sched latency higher than 10000 us (default) -+ ./aliworkslower 1000 # trace work sched latency higher than 1000 us -+""" -+parser = argparse.ArgumentParser( -+ description="Trace high work sched latency", -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ epilog=examples) -+parser.add_argument("threshold", nargs="?", type=int, default=10000, -+ help="threshold of Q2S latency, in microsecond") -+parser.add_argument("--ebpf", action="store_true", -+ help=argparse.SUPPRESS) -+ -+args = parser.parse_args() -+debug = 0 -+ -+# define BPF program -+bpf_text = """ -+#include -+ -+typedef struct work_timestamp { -+ u64 time_queue_work; -+ u64 time_activate_work; -+} work_timestamp_t; -+ -+typedef struct work_latency { -+ u64 pid; // pid of worker thread -+ u64 func; // function pointer of the work callback -+ u64 q2a; // queue_work to activate_work -+ u64 a2s; // activate_work to execute_start -+} work_latency_t; -+ -+BPF_HASH(res, u64, work_timestamp_t); -+BPF_PERF_OUTPUT(events); -+ -+TRACEPOINT_PROBE(workqueue, workqueue_queue_work) -+{ -+ u64 work = (u64)args->work; -+ work_timestamp_t stamp = { -+ .time_queue_work = bpf_ktime_get_ns(), -+ }; -+ res.update(&work, &stamp); -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(workqueue, workqueue_activate_work) -+{ -+ u64 work = (u64)args->work; -+ work_timestamp_t *stampp = res.lookup(&work); -+ -+ if (stampp) { -+ stampp->time_activate_work = bpf_ktime_get_ns(); -+ } -+ -+ return 0; -+} -+ -+TRACEPOINT_PROBE(workqueue, workqueue_execute_start) -+{ -+ u64 work = (u64)args->work; -+ u64 now = bpf_ktime_get_ns(); -+ work_timestamp_t *stampp = res.lookup(&work); -+ -+ if (stampp) { -+ u64 delta = now - stampp->time_queue_work; -+ if (delta > THRESHOLD) { -+ work_latency_t latency = { -+ .pid = (u32)bpf_get_current_pid_tgid(), -+ .func = (u64)args->function, -+ .q2a = stampp->time_activate_work - stampp->time_queue_work, -+ .a2s = now - stampp->time_activate_work, -+ }; -+ events.perf_submit(args, &latency, sizeof(work_latency_t)); -+ } -+ } -+ -+ return 0; -+} -+""" -+ -+bpf_text = bpf_text.replace('THRESHOLD', "%d" % (args.threshold * 1000)) -+ -+# output eBPF program C code after it is replaced, used by debugging -+if debug or args.ebpf: -+ print(bpf_text) -+ if args.ebpf: -+ exit() -+ -+ -+class Record(ct.Structure): -+ _fields_ = [("pid", ct.c_ulong), -+ ("func", ct.c_ulonglong), -+ ("q2a", ct.c_ulonglong), -+ ("a2s", ct.c_ulonglong)] -+ -+ -+def print_record(cpu, data, size): -+ event = ct.cast(data, ct.POINTER(Record)).contents -+ print("%10s %20s %25d %25d %25d" % (event.pid, -+ b.ksym(event.func), -+ (event.q2a + event.a2s) / 1000, -+ event.q2a / 1000, -+ event.a2s / 1000)) -+ -+# load BPF program -+b = BPF(text=bpf_text) -+print("Tracing high work sched latency... Hit Ctrl-C to end.") -+print("%10s %20s %25s %25s %25s" % -+ ("Worker Thread", "Work", "Total Q2S latency (us)", "Q2A latency (us)", "A2S latency (us)")) -+ -+b["events"].open_perf_buffer(print_record) -+ -+ -+# output -+while (1): -+ try: -+ b.perf_buffer_poll() -+ except KeyboardInterrupt: -+ exit() -diff --git a/tools/aliworkslower_example.txt b/tools/aliworkslower_example.txt -new file mode 100644 -index 0000000..21775d0 ---- /dev/null -+++ b/tools/aliworkslower_example.txt -@@ -0,0 +1,39 @@ -+aliworkslower shows high latency of work between workqueue_queue_work -+and workqueue_execute_start. -+ -+Users can specify a threshhold, and whenever the latency between -+workqueue_queue_work and workqueue_execute_start exceeds this -+threshold, related information of this work is outputed. -+ -+For example, -+ -+# ./aliworkslower 15 -+Tracing high work sched latency... Hit Ctrl-C to end. -+Worker Thread Work Total Q2S latency (us) Q2A latency (us) A2S latency (us) -+ 19783 vmstat_update 36 0 35 -+ 29195 vmstat_update 16 0 15 -+ -+- column "Worker Thread": PID of the worker thread -+- column "Work": work callback -+- column "Total Q2S latency": latency between workqueue_queue_work (Q) -+and workqueue_execute_start (S), in us -+- column "Q2A latency": latency between workqueue_queue_work (Q) -+and workqueue_activate_work (A), in us -+- column "A2S latency": latency between workqueue_activate_work (A) -+and workqueue_execute_start (S), in us -+ -+ -+# aliworkslower --help -+usage: aliworkslower.py [-h] [threshold] -+ -+Trace high work sched latency -+ -+positional arguments: -+ threshold threshold of Q2S latency, in microsecond -+ -+optional arguments: -+ -h, --help show this help message and exit -+ -+examples: -+ ./aliworkslower # trace work sched latency higher than 10000 us (default) -+ ./aliworkslower 1000 # trace work sched latency higher than 1000 us --- -2.27.0 - diff --git a/bcc-0.19.0-Define-missing-BPF_-macros.patch b/bcc-0.19.0-Define-missing-BPF_-macros.patch deleted file mode 100644 index 01d524d..0000000 --- a/bcc-0.19.0-Define-missing-BPF_-macros.patch +++ /dev/null @@ -1,27 +0,0 @@ -From d25f87677b554d9cbdc609c020a2baa59e442443 Mon Sep 17 00:00:00 2001 -From: Jerome Marchand -Date: Wed, 2 Jun 2021 10:57:43 +0200 -Subject: [PATCH] Define missing BPF_* macros - -We're currently missing BPF_MAP_TYPE_TASK_STORAGE. This could be -fixed by a future rebase. In the mean time, let's have a temporary -fix. - -diff --git a/introspection/bps.c b/introspection/bps.c -index 0eae675e..1108e417 100644 ---- a/introspection/bps.c -+++ b/introspection/bps.c -@@ -49,6 +49,10 @@ static const char * const prog_type_strings[] = { - [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", - }; - -+#ifndef BPF_MAP_TYPE_TASK_STORAGE -+#define BPF_MAP_TYPE_TASK_STORAGE (BPF_MAP_TYPE_INODE_STORAGE + 1) -+#endif -+ - static const char * const map_type_strings[] = { - [BPF_MAP_TYPE_UNSPEC] = "unspec", - [BPF_MAP_TYPE_HASH] = "hash", --- -2.31.1 - diff --git a/bcc-0.19.0-Fix-BPF-src_file-foo.patch b/bcc-0.19.0-Fix-BPF-src_file-foo.patch deleted file mode 100644 index 4154b4b..0000000 --- a/bcc-0.19.0-Fix-BPF-src_file-foo.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 9051043a126a838902b88d144eea1b631d2c3eef Mon Sep 17 00:00:00 2001 -From: Jerome Marchand -Date: Tue, 27 Apr 2021 15:13:12 +0200 -Subject: [PATCH] Fix BPF(src_file="foo") - -Since commit 75f20a15 ("Use string type for comparison to PATH -elements"), src_file isn't working anymore. Somehow, two wrongs -(ArgString __str__() returning a bytes object and joining a bytes and -what was supposed to be a string) did make a right. - -It fixes the following error in netqtop and deadlock: -Traceback (most recent call last): - File "/usr/share/bcc/tools/netqtop", line 207, in - b = BPF(src_file = EBPF_FILE) - File "/usr/lib/python3.6/site-packages/bcc/__init__.py", line 335, in __init__ - src_file = BPF._find_file(src_file) - File "/usr/lib/python3.6/site-packages/bcc/__init__.py", line 255, in _find_file - t = b"/".join([os.path.abspath(os.path.dirname(argv0.__str__())), filename]) -TypeError: sequence item 0: expected a bytes-like object, str found ---- - src/python/bcc/__init__.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py -index 90562cd7..eabe0a55 100644 ---- a/src/python/bcc/__init__.py -+++ b/src/python/bcc/__init__.py -@@ -252,7 +252,7 @@ DEBUG_BTF = 0x20 - if filename: - if not os.path.isfile(filename): - argv0 = ArgString(sys.argv[0]) -- t = b"/".join([os.path.abspath(os.path.dirname(argv0.__str__())), filename]) -+ t = b"/".join([os.path.abspath(os.path.dirname(argv0.__bytes__())), filename]) - if os.path.isfile(t): - filename = t - else: --- -2.30.2 - diff --git a/bcc-0.19.0-Fix-a-llvm-compilation-error.patch b/bcc-0.19.0-Fix-a-llvm-compilation-error.patch deleted file mode 100644 index e3febea..0000000 --- a/bcc-0.19.0-Fix-a-llvm-compilation-error.patch +++ /dev/null @@ -1,46 +0,0 @@ -From bb121e49b1a05e86c88274a89f5229b4ec6939c6 Mon Sep 17 00:00:00 2001 -From: Yonghong Song -Date: Tue, 25 May 2021 19:58:00 -0700 -Subject: [PATCH 2/2] Fix a llvm compilation error -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Current llvm trunk (https://github.com/llvm/llvm-project) -will cause the following compilation errors: - /home/yhs/work/bcc/src/cc/bcc_debug.cc: In member function ‘void ebpf::SourceDebugger::dump()’: - /home/yhs/work/bcc/src/cc/bcc_debug.cc:135:75: error: no matching function for call to - ‘llvm::MCContext::MCContext(llvm::Triple&, std::unique_ptr::pointer, - std::unique_ptr::pointer, llvm::MCObjectFileInfo*, - std::unique_ptr::pointer, std::nullptr_t)’ - MCContext Ctx(TheTriple, MAI.get(), MRI.get(), &MOFI, STI.get(), nullptr); - ^ - ...... - -This is because upstream patch https://reviews.llvm.org/D101921 -refactored MCObjectFileInfo initialization and changed MCContext -constructor signature. - -This patch fixed the issue by following the new code patterns -in https://reviews.llvm.org/D101921. ---- - src/cc/bcc_debug.cc | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/src/cc/bcc_debug.cc b/src/cc/bcc_debug.cc -index 775c9141..97d6d95b 100644 ---- a/src/cc/bcc_debug.cc -+++ b/src/cc/bcc_debug.cc -@@ -132,7 +132,8 @@ void SourceDebugger::dump() { - T->createMCSubtargetInfo(TripleStr, "", "")); - MCObjectFileInfo MOFI; - #if LLVM_MAJOR_VERSION >= 13 -- MCContext Ctx(TheTriple, MAI.get(), MRI.get(), &MOFI, STI.get(), nullptr); -+ MCContext Ctx(TheTriple, MAI.get(), MRI.get(), STI.get(), nullptr); -+ Ctx.setObjectFileInfo(&MOFI); - MOFI.initMCObjectFileInfo(Ctx, false, false); - #else - MCContext Ctx(MAI.get(), MRI.get(), &MOFI, nullptr); --- -2.31.1 - diff --git a/bcc-0.19.0-Handle-renaming-of-task_struct_-state-field-on-RHEL-.patch b/bcc-0.19.0-Handle-renaming-of-task_struct_-state-field-on-RHEL-.patch deleted file mode 100644 index ef6ef95..0000000 --- a/bcc-0.19.0-Handle-renaming-of-task_struct_-state-field-on-RHEL-.patch +++ /dev/null @@ -1,110 +0,0 @@ -From c610314e8f8265317bf54ef518df48809834feba Mon Sep 17 00:00:00 2001 -From: Jerome Marchand -Date: Thu, 14 Oct 2021 12:01:01 +0200 -Subject: [PATCH] Handle renaming of task_struct_>state field on RHEL 9 - -There has been some cleanup of task_struct's state field and to catch -any place that has been missed in the conversion, it has been renamed -__state. ---- - tools/cpudist.py | 2 +- - tools/offcputime.py | 4 ++-- - tools/offwaketime.py | 4 ++-- - tools/runqlat.py | 4 ++-- - tools/runqslower.py | 4 ++-- - 5 files changed, 9 insertions(+), 9 deletions(-) - -diff --git a/tools/cpudist.py b/tools/cpudist.py -index eb04f590..ba36ba76 100755 ---- a/tools/cpudist.py -+++ b/tools/cpudist.py -@@ -101,7 +101,7 @@ int sched_switch(struct pt_regs *ctx, struct task_struct *prev) - u32 tgid = pid_tgid >> 32, pid = pid_tgid; - - #ifdef ONCPU -- if (prev->state == TASK_RUNNING) { -+ if (prev->__state == TASK_RUNNING) { - #else - if (1) { - #endif -diff --git a/tools/offcputime.py b/tools/offcputime.py -index 128c6496..b93e78d2 100755 ---- a/tools/offcputime.py -+++ b/tools/offcputime.py -@@ -205,10 +205,10 @@ thread_context = "" - thread_context = "all threads" - thread_filter = '1' - if args.state == 0: -- state_filter = 'prev->state == 0' -+ state_filter = 'prev->__state == 0' - elif args.state: - # these states are sometimes bitmask checked -- state_filter = 'prev->state & %d' % args.state -+ state_filter = 'prev->__state & %d' % args.state - else: - state_filter = '1' - bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) -diff --git a/tools/offwaketime.py b/tools/offwaketime.py -index 753eee97..722c0381 100755 ---- a/tools/offwaketime.py -+++ b/tools/offwaketime.py -@@ -254,10 +254,10 @@ int oncpu(struct pt_regs *ctx, struct task_struct *p) { - else: - thread_filter = '1' - if args.state == 0: -- state_filter = 'p->state == 0' -+ state_filter = 'p->__state == 0' - elif args.state: - # these states are sometimes bitmask checked -- state_filter = 'p->state & %d' % args.state -+ state_filter = 'p->__state & %d' % args.state - else: - state_filter = '1' - bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) -diff --git a/tools/runqlat.py b/tools/runqlat.py -index b13ff2d1..8e443c3c 100755 ---- a/tools/runqlat.py -+++ b/tools/runqlat.py -@@ -116,7 +116,7 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev) - u32 pid, tgid; - - // ivcsw: treat like an enqueue event and store timestamp -- if (prev->state == TASK_RUNNING) { -+ if (prev->__state == TASK_RUNNING) { - tgid = prev->tgid; - pid = prev->pid; - if (!(FILTER || pid == 0)) { -@@ -170,7 +170,7 @@ RAW_TRACEPOINT_PROBE(sched_switch) - u32 pid, tgid; - - // ivcsw: treat like an enqueue event and store timestamp -- if (prev->state == TASK_RUNNING) { -+ if (prev->__state == TASK_RUNNING) { - tgid = prev->tgid; - pid = prev->pid; - if (!(FILTER || pid == 0)) { -diff --git a/tools/runqslower.py b/tools/runqslower.py -index 6df98d9f..ba71e5d3 100755 ---- a/tools/runqslower.py -+++ b/tools/runqslower.py -@@ -112,7 +112,7 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev) - u32 pid, tgid; - - // ivcsw: treat like an enqueue event and store timestamp -- if (prev->state == TASK_RUNNING) { -+ if (prev->__state == TASK_RUNNING) { - tgid = prev->tgid; - pid = prev->pid; - u64 ts = bpf_ktime_get_ns(); -@@ -178,7 +178,7 @@ RAW_TRACEPOINT_PROBE(sched_switch) - long state; - - // ivcsw: treat like an enqueue event and store timestamp -- bpf_probe_read_kernel(&state, sizeof(long), (const void *)&prev->state); -+ bpf_probe_read_kernel(&state, sizeof(long), (const void *)&prev->__state); - if (state == TASK_RUNNING) { - bpf_probe_read_kernel(&tgid, sizeof(prev->tgid), &prev->tgid); - bpf_probe_read_kernel(&pid, sizeof(prev->pid), &prev->pid); --- -2.31.1 - diff --git a/bcc-0.19.0-Remove-APInt-APSInt-toString-std-string-variants.patch b/bcc-0.19.0-Remove-APInt-APSInt-toString-std-string-variants.patch deleted file mode 100644 index 161f5e3..0000000 --- a/bcc-0.19.0-Remove-APInt-APSInt-toString-std-string-variants.patch +++ /dev/null @@ -1,41 +0,0 @@ -From da1f8b4b8389e463e323885e402f96b3bc6ceb35 Mon Sep 17 00:00:00 2001 -From: Khem Raj -Date: Mon, 14 Jun 2021 12:49:43 -0700 -Subject: [PATCH] Remove APInt/APSInt toString() std::string variants - -clang 13+ has removed this in favour of a pair of llvm::toString -() helpers inside StringExtras.h to improve compile speed by avoiding -hits on header - -Signed-off-by: Khem Raj ---- - src/cc/json_map_decl_visitor.cc | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/src/cc/json_map_decl_visitor.cc b/src/cc/json_map_decl_visitor.cc -index eff4d067..53896199 100644 ---- a/src/cc/json_map_decl_visitor.cc -+++ b/src/cc/json_map_decl_visitor.cc -@@ -20,6 +20,7 @@ - #include - #include - #include -+#include - #include "common.h" - #include "table_desc.h" - -@@ -79,7 +80,11 @@ void BMapDeclVisitor::genJSONForField(FieldDecl *F) { - result_ += "["; - TraverseDecl(F); - if (const ConstantArrayType *T = dyn_cast(F->getType())) -+#if LLVM_MAJOR_VERSION >= 13 -+ result_ += ", [" + toString(T->getSize(), 10, false) + "]"; -+#else - result_ += ", [" + T->getSize().toString(10, false) + "]"; -+#endif - if (F->isBitField()) - result_ += ", " + to_string(F->getBitWidthValue(C)); - result_ += "], "; --- -2.31.1 - diff --git a/bcc-0.19.0-fix-llvm-compilation-errors.patch b/bcc-0.19.0-fix-llvm-compilation-errors.patch deleted file mode 100644 index 3c3376a..0000000 --- a/bcc-0.19.0-fix-llvm-compilation-errors.patch +++ /dev/null @@ -1,42 +0,0 @@ -From d74c96d9423652d4467339ee24bb6db2e5df21cb Mon Sep 17 00:00:00 2001 -From: Yonghong Song -Date: Wed, 5 May 2021 19:11:13 -0700 -Subject: [PATCH 1/2] fix llvm compilation errors - -MCContext and InitMCObjectFileInfo name/signatures -are changed due to upstream patch - https://reviews.llvm.org/D101462 -Adjust related codes in bcc_debug.cc properly to resolve -the compilation error for llvm13. - -Signed-off-by: Yonghong Song ---- - src/cc/bcc_debug.cc | 9 +++++++-- - 1 file changed, 7 insertions(+), 2 deletions(-) - -diff --git a/src/cc/bcc_debug.cc b/src/cc/bcc_debug.cc -index 371b6ad3..775c9141 100644 ---- a/src/cc/bcc_debug.cc -+++ b/src/cc/bcc_debug.cc -@@ -128,11 +128,16 @@ void SourceDebugger::dump() { - return; - } - -+ std::unique_ptr STI( -+ T->createMCSubtargetInfo(TripleStr, "", "")); - MCObjectFileInfo MOFI; -+#if LLVM_MAJOR_VERSION >= 13 -+ MCContext Ctx(TheTriple, MAI.get(), MRI.get(), &MOFI, STI.get(), nullptr); -+ MOFI.initMCObjectFileInfo(Ctx, false, false); -+#else - MCContext Ctx(MAI.get(), MRI.get(), &MOFI, nullptr); - MOFI.InitMCObjectFileInfo(TheTriple, false, Ctx, false); -- std::unique_ptr STI( -- T->createMCSubtargetInfo(TripleStr, "", "")); -+#endif - - std::unique_ptr MCII(T->createMCInstrInfo()); - MCInstPrinter *IP = T->createMCInstPrinter(TheTriple, 0, *MAI, *MCII, *MRI); --- -2.31.1 - diff --git a/bcc-0.25.0-Fix-bpf_pseudo_fd-type-conversion-error.patch b/bcc-0.25.0-Fix-bpf_pseudo_fd-type-conversion-error.patch new file mode 100644 index 0000000..7ee0a2c --- /dev/null +++ b/bcc-0.25.0-Fix-bpf_pseudo_fd-type-conversion-error.patch @@ -0,0 +1,77 @@ +From 728005aac7c23590a406ac67235d915e416bff5d Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Sat, 13 Aug 2022 17:50:07 -0700 +Subject: [PATCH 1/2] Fix bpf_pseudo_fd() type conversion error + +With llvm15 and llvm16, the following command line + sudo ./trace.py 'smp_call_function_single "%K", arg1' +will cause error: + /virtual/main.c:60:36: error: incompatible integer to pointer conversion passing 'u64' + (aka 'unsigned long long') to parameter of type 'void *' [-Wint-conversion] + bpf_perf_event_output(ctx, bpf_pseudo_fd(1, -1), CUR_CPU_IDENTIFIER, &__data, sizeof(__data)); + ^~~~~~~~~~~~~~~~~~~~ + 1 error generated. + Failed to compile BPF module + +In helpers.h, we have + u64 bpf_pseudo_fd(u64, u64) asm("llvm.bpf.pseudo"); +Apparently, <= llvm14 can tolerate u64 -> 'void *' conversion, but +llvm15 by default will cause an error. + +Let us explicitly convert bpf_pseudo_fd to 'void *' to avoid +such errors. + +Signed-off-by: Yonghong Song +--- + src/cc/frontends/clang/b_frontend_action.cc | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc +index a4e05b16..dbeba3e4 100644 +--- a/src/cc/frontends/clang/b_frontend_action.cc ++++ b/src/cc/frontends/clang/b_frontend_action.cc +@@ -957,7 +957,7 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) { + string arg0 = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange())); + string args_other = rewriter_.getRewrittenText(expansionRange(SourceRange(GET_BEGINLOC(Call->getArg(1)), + GET_ENDLOC(Call->getArg(2))))); +- txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + ")"; ++ txt = "bpf_perf_event_output(" + arg0 + ", (void *)bpf_pseudo_fd(1, " + fd + ")"; + txt += ", CUR_CPU_IDENTIFIER, " + args_other + ")"; + + // e.g. +@@ -986,7 +986,7 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) { + string meta_len = rewriter_.getRewrittenText(expansionRange(Call->getArg(3)->getSourceRange())); + txt = "bpf_perf_event_output(" + + skb + ", " + +- "bpf_pseudo_fd(1, " + fd + "), " + ++ "(void *)bpf_pseudo_fd(1, " + fd + "), " + + "((__u64)" + skb_len + " << 32) | BPF_F_CURRENT_CPU, " + + meta + ", " + + meta_len + ");"; +@@ -1006,12 +1006,12 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) { + string keyp = rewriter_.getRewrittenText(expansionRange(Call->getArg(1)->getSourceRange())); + string flag = rewriter_.getRewrittenText(expansionRange(Call->getArg(2)->getSourceRange())); + txt = "bpf_" + string(memb_name) + "(" + ctx + ", " + +- "bpf_pseudo_fd(1, " + fd + "), " + keyp + ", " + flag + ");"; ++ "(void *)bpf_pseudo_fd(1, " + fd + "), " + keyp + ", " + flag + ");"; + } else if (memb_name == "ringbuf_output") { + string name = string(Ref->getDecl()->getName()); + string args = rewriter_.getRewrittenText(expansionRange(SourceRange(GET_BEGINLOC(Call->getArg(0)), + GET_ENDLOC(Call->getArg(2))))); +- txt = "bpf_ringbuf_output(bpf_pseudo_fd(1, " + fd + ")"; ++ txt = "bpf_ringbuf_output((void *)bpf_pseudo_fd(1, " + fd + ")"; + txt += ", " + args + ")"; + + // e.g. +@@ -1033,7 +1033,7 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) { + } else if (memb_name == "ringbuf_reserve") { + string name = string(Ref->getDecl()->getName()); + string arg0 = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange())); +- txt = "bpf_ringbuf_reserve(bpf_pseudo_fd(1, " + fd + ")"; ++ txt = "bpf_ringbuf_reserve((void *)bpf_pseudo_fd(1, " + fd + ")"; + txt += ", " + arg0 + ", 0)"; // Flags in reserve are meaningless + } else if (memb_name == "ringbuf_discard") { + string name = string(Ref->getDecl()->getName()); +-- +2.38.1 + diff --git a/bcc-0.25.0-Fix-clang-15-int-to-pointer-conversion-errors.patch b/bcc-0.25.0-Fix-clang-15-int-to-pointer-conversion-errors.patch new file mode 100644 index 0000000..6b12079 --- /dev/null +++ b/bcc-0.25.0-Fix-clang-15-int-to-pointer-conversion-errors.patch @@ -0,0 +1,96 @@ +From f5a6c22f613d0566ba542f38f349be379e3844e8 Mon Sep 17 00:00:00 2001 +From: Jerome Marchand +Date: Wed, 26 Oct 2022 14:41:54 +0200 +Subject: [PATCH 2/2] Fix clang 15 int to pointer conversion errors + +Since version 15, clang issues error for implicit conversion of +integer to pointer. Several tools are broken. This patch add explicit +pointer cast where needed. + +Fixes the following errors: +/virtual/main.c:37:18: error: incompatible integer to pointer conversion initializing 'struct request *' with an expression of type 'unsigned long' [-Wint-conversion] + struct request *req = ctx->di; + ^ ~~~~~~~ +/virtual/main.c:49:18: error: incompatible integer to pointer conversion initializing 'struct request *' with an expression of type 'unsigned long' [-Wint-conversion] + struct request *req = ctx->di; + ^ ~~~~~~~ +2 errors generated. + +/virtual/main.c:73:19: error: incompatible integer to pointer conversion initializing 'struct pt_regs *' with an expression of type 'unsigned long' [-Wint-conversion] + struct pt_regs * __ctx = ctx->di; + ^ ~~~~~~~ +/virtual/main.c:100:240: error: incompatible integer to pointer conversion passing 'u64' (aka 'unsigned long long') to parameter of type 'const void *' [-Wint-conversion] + data.ppid = ({ typeof(pid_t) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&({ typeof(struct task_struct *) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&task->real_parent); _val; })->tgid); _val; }); + ^~~~~~~~~~~~~~~~~~~~~~~ +/virtual/main.c:100:118: error: incompatible integer to pointer conversion passing 'u64' (aka 'unsigned long long') to parameter of type 'const void *' [-Wint-conversion] + data.ppid = ({ typeof(pid_t) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&({ typeof(struct task_struct *) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&task->real_parent); _val; })->tgid); _val; }); + ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Signed-off-by: Jerome Marchand +--- + src/cc/frontends/clang/b_frontend_action.cc | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc +index dbeba3e4..c0582464 100644 +--- a/src/cc/frontends/clang/b_frontend_action.cc ++++ b/src/cc/frontends/clang/b_frontend_action.cc +@@ -517,9 +517,9 @@ bool ProbeVisitor::VisitUnaryOperator(UnaryOperator *E) { + string pre, post; + pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));"; + if (cannot_fall_back_safely) +- pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (u64)"; ++ pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (void *)"; + else +- pre += " bpf_probe_read(&_val, sizeof(_val), (u64)"; ++ pre += " bpf_probe_read(&_val, sizeof(_val), (void *)"; + post = "); _val; })"; + rewriter_.ReplaceText(expansionLoc(E->getOperatorLoc()), 1, pre); + rewriter_.InsertTextAfterToken(expansionLoc(GET_ENDLOC(sub)), post); +@@ -581,9 +581,9 @@ bool ProbeVisitor::VisitMemberExpr(MemberExpr *E) { + string pre, post; + pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));"; + if (cannot_fall_back_safely) +- pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (u64)&"; ++ pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (void *)&"; + else +- pre += " bpf_probe_read(&_val, sizeof(_val), (u64)&"; ++ pre += " bpf_probe_read(&_val, sizeof(_val), (void *)&"; + post = rhs + "); _val; })"; + rewriter_.InsertText(expansionLoc(GET_BEGINLOC(E)), pre); + rewriter_.ReplaceText(expansionRange(SourceRange(member, GET_ENDLOC(E))), post); +@@ -635,9 +635,9 @@ bool ProbeVisitor::VisitArraySubscriptExpr(ArraySubscriptExpr *E) { + + pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));"; + if (cannot_fall_back_safely) +- pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (u64)(("; ++ pre += " bpf_probe_read_kernel(&_val, sizeof(_val), (void *)(("; + else +- pre += " bpf_probe_read(&_val, sizeof(_val), (u64)(("; ++ pre += " bpf_probe_read(&_val, sizeof(_val), (void *)(("; + if (isMemberDereference(base)) { + pre += "&"; + // If the base of the array subscript is a member dereference, we'll rewrite +@@ -747,8 +747,8 @@ void BTypeVisitor::genParamDirectAssign(FunctionDecl *D, string& preamble, + arg->addAttr(UnavailableAttr::CreateImplicit(C, "ptregs")); + size_t d = idx - 1; + const char *reg = calling_conv_regs[d]; +- preamble += " " + text + " = " + fn_args_[0]->getName().str() + "->" + +- string(reg) + ";"; ++ preamble += " " + text + " = (" + arg->getType().getAsString() + ")" + ++ fn_args_[0]->getName().str() + "->" + string(reg) + ";"; + } + } + } +@@ -762,7 +762,7 @@ void BTypeVisitor::genParamIndirectAssign(FunctionDecl *D, string& preamble, + + if (idx == 0) { + new_ctx = "__" + arg->getName().str(); +- preamble += " struct pt_regs * " + new_ctx + " = " + ++ preamble += " struct pt_regs * " + new_ctx + " = (void *)" + + arg->getName().str() + "->" + + string(calling_conv_regs[0]) + ";"; + } else { +-- +2.38.1 + diff --git a/bcc-0.25.0-Fix-get_kprobe_functions.patch b/bcc-0.25.0-Fix-get_kprobe_functions.patch new file mode 100644 index 0000000..b3b224f --- /dev/null +++ b/bcc-0.25.0-Fix-get_kprobe_functions.patch @@ -0,0 +1,30 @@ +From c27899b15bca6188d34c0b87b3389eeda2a90cb5 Mon Sep 17 00:00:00 2001 +From: Jerome Marchand +Date: Mon, 9 Jan 2023 18:17:20 +0100 +Subject: [PATCH] Fix get_kprobe_functions + +get_kprobe_functions will not only return a function that matches the +regular expression, but also any function that starts with a +substrings that matches it. This is obviously not the intended +behavior. +The issue is easily fixed by replacing re.match by re.fullmatch. +--- + src/python/bcc/__init__.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py +index 7175b98e..970ddcc2 100644 +--- a/src/python/bcc/__init__.py ++++ b/src/python/bcc/__init__.py +@@ -745,7 +745,7 @@ DEBUG_BTF = 0x20 + # Exclude all gcc 8's extra .cold functions + elif re.match(b'^.*\.cold(\.\d+)?$', fn): + continue +- if (t.lower() in [b't', b'w']) and re.match(event_re, fn) \ ++ if (t.lower() in [b't', b'w']) and re.fullmatch(event_re, fn) \ + and fn not in blacklist: + fns.append(fn) + return set(fns) # Some functions may appear more than once +-- +2.38.1 + diff --git a/bcc-0.19.0-Manpages-remove-unstable-statement.patch b/bcc-0.25.0-Manpages-remove-unstable-statement.patch similarity index 89% rename from bcc-0.19.0-Manpages-remove-unstable-statement.patch rename to bcc-0.25.0-Manpages-remove-unstable-statement.patch index dc84ec9..904bff1 100644 --- a/bcc-0.19.0-Manpages-remove-unstable-statement.patch +++ b/bcc-0.25.0-Manpages-remove-unstable-statement.patch @@ -1,7 +1,7 @@ -From f4e66f50d1af5b5dd0dc1dbb261f911514c465ed Mon Sep 17 00:00:00 2001 +From ce407eab95f96badd1879c9c7342eeabcfd17311 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 6 Aug 2019 14:44:33 +0200 -Subject: [PATCH] Manpages: remove unstable statement +Subject: [PATCH 1/3] Manpages: remove unstable statement The tools are tested before each release. They should be stable enough. @@ -11,6 +11,7 @@ enough. man/man8/bindsnoop.8 | 2 -- man/man8/biolatency.8 | 2 -- man/man8/biolatpcts.8 | 2 -- + man/man8/biopattern.8 | 2 -- man/man8/biosnoop.8 | 2 -- man/man8/biotop.8 | 2 -- man/man8/bitesize.8 | 2 -- @@ -48,6 +49,7 @@ enough. man/man8/inject.8 | 4 ---- man/man8/killsnoop.8 | 2 -- man/man8/klockstat.8 | 2 -- + man/man8/kvmexit.8 | 2 -- man/man8/llcstat.8 | 2 -- man/man8/mdflush.8 | 2 -- man/man8/memleak.8 | 2 -- @@ -80,6 +82,7 @@ enough. man/man8/syncsnoop.8 | 2 -- man/man8/syscount.8 | 2 -- man/man8/tcpaccept.8 | 2 -- + man/man8/tcpcong.8 | 2 -- man/man8/tcpconnect.8 | 2 -- man/man8/tcpconnlat.8 | 2 -- man/man8/tcpdrop.8 | 2 -- @@ -109,13 +112,13 @@ enough. man/man8/xfsslower.8 | 2 -- man/man8/zfsdist.8 | 2 -- man/man8/zfsslower.8 | 2 -- - 103 files changed, 208 deletions(-) + 106 files changed, 214 deletions(-) diff --git a/man/man8/argdist.8 b/man/man8/argdist.8 -index 4116cd4d..aa128d2d 100644 +index 75b7fe63..99291521 100644 --- a/man/man8/argdist.8 +++ b/man/man8/argdist.8 -@@ -188,7 +188,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -191,7 +191,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -150,17 +153,17 @@ index f8fa1850..05ed95a7 100644 Pavel Dubovitsky .SH SEE ALSO diff --git a/man/man8/biolatency.8 b/man/man8/biolatency.8 -index c303eec0..895deb31 100644 +index db2ef484..21c3930e 100644 --- a/man/man8/biolatency.8 +++ b/man/man8/biolatency.8 -@@ -98,8 +98,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -108,8 +108,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux -.SH STABILITY -Unstable - in development. .SH AUTHOR - Brendan Gregg + Brendan Gregg, Rocky Xing .SH SEE ALSO diff --git a/man/man8/biolatpcts.8 b/man/man8/biolatpcts.8 index 7a06a11d..9dd41d08 100644 @@ -175,34 +178,47 @@ index 7a06a11d..9dd41d08 100644 .SH AUTHOR Tejun Heo .SH SEE ALSO +diff --git a/man/man8/biopattern.8 b/man/man8/biopattern.8 +index 451d667f..70ba73e7 100644 +--- a/man/man8/biopattern.8 ++++ b/man/man8/biopattern.8 +@@ -70,8 +70,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing + example usage, output, and commentary for this tool. + .SH OS + Linux +-.SH STABILITY +-Unstable - in development. + .SH AUTHOR + Rocky Xing + .SH SEE ALSO diff --git a/man/man8/biosnoop.8 b/man/man8/biosnoop.8 -index 4c073f76..2b2f7434 100644 +index 24f19edf..06a85757 100644 --- a/man/man8/biosnoop.8 +++ b/man/man8/biosnoop.8 -@@ -79,8 +79,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -82,8 +82,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux -.SH STABILITY -Unstable - in development. .SH AUTHOR - Brendan Gregg + Brendan Gregg, Rocky Xing .SH SEE ALSO diff --git a/man/man8/biotop.8 b/man/man8/biotop.8 -index ed25521f..b00bf68a 100644 +index 47392bc7..ac71d34e 100644 --- a/man/man8/biotop.8 +++ b/man/man8/biotop.8 -@@ -95,8 +95,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -98,8 +98,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux -.SH STABILITY -Unstable - in development. .SH AUTHOR - Brendan Gregg + Brendan Gregg, Rocky Xing .SH INSPIRATION diff --git a/man/man8/bitesize.8 b/man/man8/bitesize.8 -index 99cdbaab..529a1e26 100644 +index 655f69e7..314560b8 100644 --- a/man/man8/bitesize.8 +++ b/man/man8/bitesize.8 @@ -43,8 +43,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing @@ -278,17 +294,17 @@ index 172194d4..575941fc 100644 Allan McAleavy .SH SEE ALSO diff --git a/man/man8/cachetop.8 b/man/man8/cachetop.8 -index 5642fa1d..1fe4067b 100644 +index bb7bb3cc..079b3cbf 100644 --- a/man/man8/cachetop.8 +++ b/man/man8/cachetop.8 -@@ -83,8 +83,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -90,8 +90,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux -.SH STABILITY -Unstable - in development. .SH AUTHOR - Emmanuel Bretelle + Emmanuel Bretelle, Rocky Xing .SH SEE ALSO diff --git a/man/man8/capable.8 b/man/man8/capable.8 index 342946f8..2b7d13ba 100644 @@ -316,17 +332,17 @@ index a2933d7a..c1339579 100644 .SH AUTHOR Wenbo Zhang diff --git a/man/man8/cpudist.8 b/man/man8/cpudist.8 -index 6ee1f3bf..979f0451 100644 +index 59937baa..1d6a7198 100644 --- a/man/man8/cpudist.8 +++ b/man/man8/cpudist.8 -@@ -99,8 +99,6 @@ Also look in the bcc distribution for a companion _example.txt file containing +@@ -115,8 +115,6 @@ Also look in the bcc distribution for a companion _example.txt file containing example usage, output, and commentary for this tool. .SH OS Linux -.SH STABILITY -Unstable - in development. .SH AUTHOR - Sasha Goldshtein + Sasha Goldshtein, Rocky Xing .SH SEE ALSO diff --git a/man/man8/cpuunclaimed.8 b/man/man8/cpuunclaimed.8 index 674be499..107ff67b 100644 @@ -342,10 +358,10 @@ index 674be499..107ff67b 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/criticalstat.8 b/man/man8/criticalstat.8 -index cdd5e8ed..8f918707 100644 +index 6b1c1110..6df592a3 100644 --- a/man/man8/criticalstat.8 +++ b/man/man8/criticalstat.8 -@@ -65,8 +65,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -66,8 +66,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -407,10 +423,10 @@ index e2bc4dc8..0501460f 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/deadlock.8 b/man/man8/deadlock.8 -index 0be3f4ab..7300446a 100644 +index 3e7744ce..9f0f7de0 100644 --- a/man/man8/deadlock.8 +++ b/man/man8/deadlock.8 -@@ -136,7 +136,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -144,7 +144,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -432,10 +448,10 @@ index cc61a676..9a0fd3d9 100644 Erwan Velu .SH INSPIRATION diff --git a/man/man8/drsnoop.8 b/man/man8/drsnoop.8 -index 572c0dce..3563da81 100644 +index 90ca901f..64cc1c68 100644 --- a/man/man8/drsnoop.8 +++ b/man/man8/drsnoop.8 -@@ -104,7 +104,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -107,7 +107,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -522,7 +538,7 @@ index fe912436..d567986b 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/filetop.8 b/man/man8/filetop.8 -index ba0cbd6e..1f818d68 100644 +index 2d5f191d..6b588ef9 100644 --- a/man/man8/filetop.8 +++ b/man/man8/filetop.8 @@ -111,8 +111,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing @@ -535,7 +551,7 @@ index ba0cbd6e..1f818d68 100644 Brendan Gregg .SH INSPIRATION diff --git a/man/man8/funccount.8 b/man/man8/funccount.8 -index 16ce4fc0..1c7b71c1 100644 +index b2cb8575..9e019d92 100644 --- a/man/man8/funccount.8 +++ b/man/man8/funccount.8 @@ -104,8 +104,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing @@ -548,10 +564,10 @@ index 16ce4fc0..1c7b71c1 100644 Brendan Gregg, Sasha Goldshtein .SH SEE ALSO diff --git a/man/man8/funcinterval.8 b/man/man8/funcinterval.8 -index 89a4a7b4..c9e1f31a 100755 +index 77128290..f76ff315 100644 --- a/man/man8/funcinterval.8 +++ b/man/man8/funcinterval.8 -@@ -106,8 +106,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -114,8 +114,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -561,10 +577,10 @@ index 89a4a7b4..c9e1f31a 100755 Edward Wu .SH SEE ALSO diff --git a/man/man8/funclatency.8 b/man/man8/funclatency.8 -index b82626cf..3c8e248e 100644 +index 9012b832..088b26cd 100644 --- a/man/man8/funclatency.8 +++ b/man/man8/funclatency.8 -@@ -127,8 +127,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -131,8 +131,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -600,17 +616,17 @@ index a9d18e07..876f3983 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/hardirqs.8 b/man/man8/hardirqs.8 -index 8e7237a9..9e6975c9 100644 +index aa9afb84..e74d56b3 100644 --- a/man/man8/hardirqs.8 +++ b/man/man8/hardirqs.8 -@@ -87,8 +87,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -95,8 +95,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux -.SH STABILITY -Unstable - in development. .SH AUTHOR - Brendan Gregg + Brendan Gregg, Hengqi Chen, Rocky Xing .SH SEE ALSO diff --git a/man/man8/inject.8 b/man/man8/inject.8 index 2ab80dbb..85b36b6e 100644 @@ -652,11 +668,23 @@ index 0a3167d1..57f9db18 100644 .SH CREDITS This tool is based on work of David Valin and his script. .SH AUTHOR +diff --git a/man/man8/kvmexit.8 b/man/man8/kvmexit.8 +index c0cb4c98..0b56cecc 100644 +--- a/man/man8/kvmexit.8 ++++ b/man/man8/kvmexit.8 +@@ -109,7 +109,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing + example usage, output, and commentary for this tool. + .SH OS + Linux +-.SH STABILITY +-Unstable - in development. + .SH AUTHOR + Fei Li diff --git a/man/man8/llcstat.8 b/man/man8/llcstat.8 -index 36dbed7d..2b554c46 100644 +index 5a28d338..7440fe3b 100644 --- a/man/man8/llcstat.8 +++ b/man/man8/llcstat.8 -@@ -63,8 +63,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -66,8 +66,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -666,7 +694,7 @@ index 36dbed7d..2b554c46 100644 Teng Qin .SH SEE ALSO diff --git a/man/man8/mdflush.8 b/man/man8/mdflush.8 -index 9d10ca87..3c3b9bba 100644 +index e22c46b3..66939b25 100644 --- a/man/man8/mdflush.8 +++ b/man/man8/mdflush.8 @@ -49,8 +49,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing @@ -768,10 +796,10 @@ index be5387b9..270991d8 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/offwaketime.8 b/man/man8/offwaketime.8 -index 7334b6f8..f0704e92 100644 +index 44e3b684..4f4880e3 100644 --- a/man/man8/offwaketime.8 +++ b/man/man8/offwaketime.8 -@@ -107,8 +107,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -110,8 +110,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -820,7 +848,7 @@ index 2164ffaf..9d272c81 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/profile.8 b/man/man8/profile.8 -index 30871afe..8523a731 100644 +index 916224a9..6339dbfa 100644 --- a/man/man8/profile.8 +++ b/man/man8/profile.8 @@ -148,8 +148,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing @@ -884,10 +912,10 @@ index b36a5a18..c2f0ffb4 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/runqslower.8 b/man/man8/runqslower.8 -index f8b5f008..c72389d7 100644 +index 55ea5bd9..dbbec5ae 100644 --- a/man/man8/runqslower.8 +++ b/man/man8/runqslower.8 -@@ -81,8 +81,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -84,8 +84,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -933,17 +961,17 @@ index cd3ffa27..94432544 100644 -.SH STABILITY -Unstable - in development. diff --git a/man/man8/softirqs.8 b/man/man8/softirqs.8 -index a9a14414..df2fc90b 100644 +index fa475f78..311fd0e6 100644 --- a/man/man8/softirqs.8 +++ b/man/man8/softirqs.8 -@@ -85,8 +85,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -99,8 +99,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux -.SH STABILITY -Unstable - in development. .SH AUTHORS - Brendan Gregg, Sasha Goldshtein + Brendan Gregg, Sasha Goldshtein, Rocky Xing .SH SEE ALSO diff --git a/man/man8/solisten.8 b/man/man8/solisten.8 index 4d8ffe95..fd863447 100644 @@ -969,10 +997,10 @@ index ffad57c5..df80437f 100644 Jiri Olsa .SH SEE ALSO diff --git a/man/man8/sslsniff.8 b/man/man8/sslsniff.8 -index 7b945b00..b23f4c68 100644 +index 4b80191a..86219f8a 100644 --- a/man/man8/sslsniff.8 +++ b/man/man8/sslsniff.8 -@@ -43,8 +43,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -108,8 +108,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1008,10 +1036,10 @@ index c0555043..d1560b3c 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/swapin.8 b/man/man8/swapin.8 -index c5ef1ffc..89600460 100644 +index 9ef2eb9c..7347899d 100644 --- a/man/man8/swapin.8 +++ b/man/man8/swapin.8 -@@ -50,8 +50,6 @@ Also look in the bcc distribution for a companion _examples.txt file +@@ -56,8 +56,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1034,23 +1062,23 @@ index 8543f213..cd7b0629 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/syscount.8 b/man/man8/syscount.8 -index d13793be..88343e14 100644 +index 8c245dd8..54f9fe66 100644 --- a/man/man8/syscount.8 +++ b/man/man8/syscount.8 -@@ -102,8 +102,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -108,8 +108,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux -.SH STABILITY -Unstable - in development. .SH AUTHOR - Sasha Goldshtein + Sasha Goldshtein, Rocky Xing .SH SEE ALSO diff --git a/man/man8/tcpaccept.8 b/man/man8/tcpaccept.8 -index 603a5ca4..4e7dfc9e 100644 +index 05b79693..6dd35301 100644 --- a/man/man8/tcpaccept.8 +++ b/man/man8/tcpaccept.8 -@@ -104,8 +104,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -118,8 +118,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1059,11 +1087,24 @@ index 603a5ca4..4e7dfc9e 100644 .SH AUTHOR Brendan Gregg .SH SEE ALSO +diff --git a/man/man8/tcpcong.8 b/man/man8/tcpcong.8 +index 877ed805..3fd0a253 100644 +--- a/man/man8/tcpcong.8 ++++ b/man/man8/tcpcong.8 +@@ -128,8 +128,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing + example usage, output, and commentary for this tool. + .SH OS + Linux +-.SH STABILITY +-Unstable - in development. + .SH AUTHOR + jacky gan + .SH SEE ALSO diff --git a/man/man8/tcpconnect.8 b/man/man8/tcpconnect.8 -index 55105709..7c1740a4 100644 +index 0ea84686..04952d74 100644 --- a/man/man8/tcpconnect.8 +++ b/man/man8/tcpconnect.8 -@@ -183,8 +183,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -197,8 +197,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1073,10 +1114,10 @@ index 55105709..7c1740a4 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/tcpconnlat.8 b/man/man8/tcpconnlat.8 -index 9c810071..236f0fb0 100644 +index 84762b45..3771baa7 100644 --- a/man/man8/tcpconnlat.8 +++ b/man/man8/tcpconnlat.8 -@@ -111,8 +111,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -125,8 +125,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1086,10 +1127,10 @@ index 9c810071..236f0fb0 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/tcpdrop.8 b/man/man8/tcpdrop.8 -index 12806472..a2e2860c 100644 +index c9b777b3..82e400eb 100644 --- a/man/man8/tcpdrop.8 +++ b/man/man8/tcpdrop.8 -@@ -65,8 +65,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -83,8 +83,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1099,10 +1140,10 @@ index 12806472..a2e2860c 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/tcplife.8 b/man/man8/tcplife.8 -index a2419c61..a2a2550c 100644 +index 5fb4c283..cc76d870 100644 --- a/man/man8/tcplife.8 +++ b/man/man8/tcplife.8 -@@ -122,8 +122,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -136,8 +136,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1112,10 +1153,10 @@ index a2419c61..a2a2550c 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/tcpretrans.8 b/man/man8/tcpretrans.8 -index 0ac82afa..04226dc6 100644 +index 0b643d11..aebdab73 100644 --- a/man/man8/tcpretrans.8 +++ b/man/man8/tcpretrans.8 -@@ -83,8 +83,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -103,8 +103,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1125,10 +1166,10 @@ index 0ac82afa..04226dc6 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/tcprtt.8 b/man/man8/tcprtt.8 -index 729a1abb..e934102a 100644 +index fcd8bfe9..4ee6b8b5 100644 --- a/man/man8/tcprtt.8 +++ b/man/man8/tcprtt.8 -@@ -78,8 +78,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -95,8 +95,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1138,10 +1179,10 @@ index 729a1abb..e934102a 100644 zhenwei pi .SH SEE ALSO diff --git a/man/man8/tcpstates.8 b/man/man8/tcpstates.8 -index 26c7a8a1..d93c198d 100644 +index 57c40a83..1d6b8a29 100644 --- a/man/man8/tcpstates.8 +++ b/man/man8/tcpstates.8 -@@ -123,8 +123,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -137,8 +137,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1164,10 +1205,10 @@ index 525b8082..daf41583 100644 Rodrigo Manyari .SH INSPIRATION diff --git a/man/man8/tcpsynbl.8 b/man/man8/tcpsynbl.8 -index 4dd38c8a..93c353b1 100644 +index 8557af2b..c03c5460 100644 --- a/man/man8/tcpsynbl.8 +++ b/man/man8/tcpsynbl.8 -@@ -51,8 +51,6 @@ Also look in the bcc distribution for a companion _examples.txt file +@@ -69,8 +69,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1177,10 +1218,10 @@ index 4dd38c8a..93c353b1 100644 Brendan Gregg .SH SEE ALSO diff --git a/man/man8/tcptop.8 b/man/man8/tcptop.8 -index e636f456..633b0c93 100644 +index f4f1c3d7..013cb3d9 100644 --- a/man/man8/tcptop.8 +++ b/man/man8/tcptop.8 -@@ -113,8 +113,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -127,8 +127,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1190,10 +1231,10 @@ index e636f456..633b0c93 100644 Brendan Gregg .SH INSPIRATION diff --git a/man/man8/tcptracer.8 b/man/man8/tcptracer.8 -index d2346c77..893604d6 100644 +index 59240f4b..331efe9a 100644 --- a/man/man8/tcptracer.8 +++ b/man/man8/tcptracer.8 -@@ -99,8 +99,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -113,8 +113,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1228,10 +1269,10 @@ index da5edf37..f7e459da 100644 .SH AUTHOR Sasha Goldshtein diff --git a/man/man8/trace.8 b/man/man8/trace.8 -index e4f06fc7..7b68d833 100644 +index c4417e5f..bd1b11b2 100644 --- a/man/man8/trace.8 +++ b/man/man8/trace.8 -@@ -210,7 +210,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -217,7 +217,5 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1240,10 +1281,10 @@ index e4f06fc7..7b68d833 100644 .SH AUTHOR Sasha Goldshtein diff --git a/man/man8/ttysnoop.8 b/man/man8/ttysnoop.8 -index 9f37aaa9..36eb75d1 100644 +index e2ec037f..e20feb01 100644 --- a/man/man8/ttysnoop.8 +++ b/man/man8/ttysnoop.8 -@@ -52,8 +52,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing +@@ -58,8 +58,6 @@ Also look in the bcc distribution for a companion _examples.txt file containing example usage, output, and commentary for this tool. .SH OS Linux @@ -1435,5 +1476,5 @@ index d1e2f9c1..31d382f6 100644 Brendan Gregg .SH SEE ALSO -- -2.30.2 +2.38.1 diff --git a/bcc-0.25.0-RHEL-libbpf-version-fixes.patch b/bcc-0.25.0-RHEL-libbpf-version-fixes.patch new file mode 100644 index 0000000..fa08b57 --- /dev/null +++ b/bcc-0.25.0-RHEL-libbpf-version-fixes.patch @@ -0,0 +1,684 @@ +From 66d9bffa99738bbed50b3d5b2d87990cdb5e4a58 Mon Sep 17 00:00:00 2001 +From: Jerome Marchand +Date: Mon, 28 Nov 2022 12:23:59 +0100 +Subject: [PATCH] RHEL: libbpf version fixes + +Revert "[bcc] stop using deprecated `bpf_load_program_attr`" +Revert "backport `struct bpf_create_map_attr`" +Revert "bcc: Replace deprecated libbpf API" + +Revert "bcc: Replace deprecated libbpf APIs" since the libbpf version +provided in RHEL 8 doesn't provide the new APIs. + +Remove BPF_MAP_TYPE_BLOOM_FILTER from bps since the libbpf version in +RHEL 8, doesn't provide bloom filter map. + +Rename btf__load_vmlinux_btf into libbpf_find_kernel_btf. The function +has been renamed upstream for naming consistency, but RHEL 8 libbpf +still uses the old name. + +Also use the older btf__get_nr_types() instead of btf__type_cnt() for +the same reason. + +Add definition of struct bpf_core_relo. +--- + introspection/bps.c | 1 - + libbpf-tools/ksnoop.c | 4 +- + src/cc/bcc_btf.cc | 73 +++++++++++++++- + src/cc/bpf_module.cc | 38 ++++---- + src/cc/common.cc | 4 +- + src/cc/libbpf.c | 196 +++++++++++++++--------------------------- + src/cc/libbpf.h | 28 ++---- + 7 files changed, 169 insertions(+), 175 deletions(-) + +diff --git a/introspection/bps.c b/introspection/bps.c +index 232b23d4..6ec02e6c 100644 +--- a/introspection/bps.c ++++ b/introspection/bps.c +@@ -80,7 +80,6 @@ static const char * const map_type_strings[] = { + [BPF_MAP_TYPE_RINGBUF] = "ringbuf", + [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", + [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", +- [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + }; + + #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +diff --git a/libbpf-tools/ksnoop.c b/libbpf-tools/ksnoop.c +index 87fe175c..960e901b 100644 +--- a/libbpf-tools/ksnoop.c ++++ b/libbpf-tools/ksnoop.c +@@ -347,7 +347,7 @@ static struct btf *get_btf(const char *name) + name && strlen(name) > 0 ? name : "vmlinux"); + + if (!vmlinux_btf) { +- vmlinux_btf = btf__load_vmlinux_btf(); ++ vmlinux_btf = libbpf_find_kernel_btf(); + if (!vmlinux_btf) { + err = -errno; + p_err("No BTF, cannot determine type info: %s", strerror(-err)); +@@ -357,7 +357,7 @@ static struct btf *get_btf(const char *name) + if (!name || strlen(name) == 0) + return vmlinux_btf; + +- mod_btf = btf__load_module_btf(name, vmlinux_btf); ++ mod_btf = libbpf_find_kernel_btf(name, vmlinux_btf); + if (!mod_btf) { + err = -errno; + p_err("No BTF for module '%s': %s", name, strerror(-err)); +diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc +index be248612..74fc902c 100644 +--- a/src/cc/bcc_btf.cc ++++ b/src/cc/bcc_btf.cc +@@ -170,6 +170,77 @@ static int btf_ext_setup_line_info(struct btf_ext *btf_ext) + return btf_ext_setup_info(btf_ext, ¶m); + } + ++/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value ++ * has to be adjusted by relocations. ++ */ ++enum bpf_core_relo_kind { ++ BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ ++ BPF_FIELD_BYTE_SIZE = 1, /* field size in bytes */ ++ BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ ++ BPF_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ ++ BPF_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ ++ BPF_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ ++ BPF_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ ++ BPF_TYPE_ID_TARGET = 7, /* type ID in target kernel */ ++ BPF_TYPE_EXISTS = 8, /* type existence in target kernel */ ++ BPF_TYPE_SIZE = 9, /* type size in bytes */ ++ BPF_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ ++ BPF_ENUMVAL_VALUE = 11, /* enum value integer value */ ++}; ++ ++/* The minimum bpf_core_relo checked by the loader ++ * ++ * CO-RE relocation captures the following data: ++ * - insn_off - instruction offset (in bytes) within a BPF program that needs ++ * its insn->imm field to be relocated with actual field info; ++ * - type_id - BTF type ID of the "root" (containing) entity of a relocatable ++ * type or field; ++ * - access_str_off - offset into corresponding .BTF string section. String ++ * interpretation depends on specific relocation kind: ++ * - for field-based relocations, string encodes an accessed field using ++ * a sequence of field and array indices, separated by colon (:). It's ++ * conceptually very close to LLVM's getelementptr ([0]) instruction's ++ * arguments for identifying offset to a field. ++ * - for type-based relocations, strings is expected to be just "0"; ++ * - for enum value-based relocations, string contains an index of enum ++ * value within its enum type; ++ * ++ * Example to provide a better feel. ++ * ++ * struct sample { ++ * int a; ++ * struct { ++ * int b[10]; ++ * }; ++ * }; ++ * ++ * struct sample *s = ...; ++ * int x = &s->a; // encoded as "0:0" (a is field #0) ++ * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, ++ * // b is field #0 inside anon struct, accessing elem #5) ++ * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) ++ * ++ * type_id for all relocs in this example will capture BTF type id of ++ * `struct sample`. ++ * ++ * Such relocation is emitted when using __builtin_preserve_access_index() ++ * Clang built-in, passing expression that captures field address, e.g.: ++ * ++ * bpf_probe_read(&dst, sizeof(dst), ++ * __builtin_preserve_access_index(&src->a.b.c)); ++ * ++ * In this case Clang will emit field relocation recording necessary data to ++ * be able to find offset of embedded `a.b.c` field within `src` struct. ++ * ++ * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction ++ */ ++struct bpf_core_relo { ++ __u32 insn_off; ++ __u32 type_id; ++ __u32 access_str_off; ++ enum bpf_core_relo_kind kind; ++}; ++ + static int btf_ext_setup_core_relos(struct btf_ext *btf_ext) + { + struct btf_ext_sec_setup_param param = { +@@ -597,7 +668,7 @@ int BTF::load(uint8_t *btf_sec, uintptr_t btf_sec_size, + return -1; + } + +- if (btf__load_into_kernel(btf)) { ++ if (btf__load(btf)) { + btf__free(btf); + warning("Loading .BTF section failed\n"); + return -1; +diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc +index 86f6a228..490fffe8 100644 +--- a/src/cc/bpf_module.cc ++++ b/src/cc/bpf_module.cc +@@ -407,7 +407,7 @@ int BPFModule::create_maps(std::map> &map_tids, + } + + if (pinned_id <= 0) { +- struct bcc_create_map_attr attr = {}; ++ struct bpf_create_map_attr attr = {}; + attr.map_type = (enum bpf_map_type)map_type; + attr.name = map_name; + attr.key_size = key_size; +@@ -982,22 +982,26 @@ int BPFModule::bcc_func_load(int prog_type, const char *name, + const char *license, unsigned kern_version, + int log_level, char *log_buf, unsigned log_buf_size, + const char *dev_name, unsigned flags, int expected_attach_type) { +- struct bpf_prog_load_opts opts = {}; ++ struct bpf_load_program_attr attr = {}; + unsigned func_info_cnt, line_info_cnt, finfo_rec_size, linfo_rec_size; + void *func_info = NULL, *line_info = NULL; + int ret; + ++ attr.prog_type = (enum bpf_prog_type)prog_type; + if (expected_attach_type != -1) { +- opts.expected_attach_type = (enum bpf_attach_type)expected_attach_type; ++ attr.expected_attach_type = (enum bpf_attach_type)expected_attach_type; + } +- if (prog_type != BPF_PROG_TYPE_TRACING && +- prog_type != BPF_PROG_TYPE_EXT) { +- opts.kern_version = kern_version; ++ attr.name = name; ++ attr.insns = insns; ++ attr.license = license; ++ if (attr.prog_type != BPF_PROG_TYPE_TRACING && ++ attr.prog_type != BPF_PROG_TYPE_EXT) { ++ attr.kern_version = kern_version; + } +- opts.prog_flags = flags; +- opts.log_level = log_level; ++ attr.prog_flags = flags; ++ attr.log_level = log_level; + if (dev_name) +- opts.prog_ifindex = if_nametoindex(dev_name); ++ attr.prog_ifindex = if_nametoindex(dev_name); + + if (btf_) { + int btf_fd = btf_->get_fd(); +@@ -1008,17 +1012,17 @@ int BPFModule::bcc_func_load(int prog_type, const char *name, + &finfo_rec_size, &line_info, + &line_info_cnt, &linfo_rec_size); + if (!ret) { +- opts.prog_btf_fd = btf_fd; +- opts.func_info = func_info; +- opts.func_info_cnt = func_info_cnt; +- opts.func_info_rec_size = finfo_rec_size; +- opts.line_info = line_info; +- opts.line_info_cnt = line_info_cnt; +- opts.line_info_rec_size = linfo_rec_size; ++ attr.prog_btf_fd = btf_fd; ++ attr.func_info = func_info; ++ attr.func_info_cnt = func_info_cnt; ++ attr.func_info_rec_size = finfo_rec_size; ++ attr.line_info = line_info; ++ attr.line_info_cnt = line_info_cnt; ++ attr.line_info_rec_size = linfo_rec_size; + } + } + +- ret = bcc_prog_load_xattr((enum bpf_prog_type)prog_type, name, license, insns, &opts, prog_len, log_buf, log_buf_size, allow_rlimit_); ++ ret = bcc_prog_load_xattr(&attr, prog_len, log_buf, log_buf_size, allow_rlimit_); + if (btf_) { + free(func_info); + free(line_info); +diff --git a/src/cc/common.cc b/src/cc/common.cc +index 3143adb0..11970275 100644 +--- a/src/cc/common.cc ++++ b/src/cc/common.cc +@@ -34,11 +34,11 @@ using std::experimental::optional; + static optional get_enum_val_from_btf(const char *name) { + optional val; + +- auto btf = btf__load_vmlinux_btf(); ++ auto btf = libbpf_find_kernel_btf(); + if (libbpf_get_error(btf)) + return {}; + +- for (size_t i = 1; i < btf__type_cnt(btf); i++) { ++ for (size_t i = 1; i <= btf__get_nr_types(btf); i++) { + auto t = btf__type_by_id(btf, i); + if (btf_kind(t) != BTF_KIND_ENUM) + continue; +diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c +index 0c09f9b3..7042c792 100644 +--- a/src/cc/libbpf.c ++++ b/src/cc/libbpf.c +@@ -319,33 +319,14 @@ static uint64_t ptr_to_u64(void *ptr) + return (uint64_t) (unsigned long) ptr; + } + +-static int libbpf_bpf_map_create(struct bcc_create_map_attr *create_attr) +-{ +- LIBBPF_OPTS(bpf_map_create_opts, p); +- +- p.map_flags = create_attr->map_flags; +- p.numa_node = create_attr->numa_node; +- p.btf_fd = create_attr->btf_fd; +- p.btf_key_type_id = create_attr->btf_key_type_id; +- p.btf_value_type_id = create_attr->btf_value_type_id; +- p.map_ifindex = create_attr->map_ifindex; +- if (create_attr->map_type == BPF_MAP_TYPE_STRUCT_OPS) +- p.btf_vmlinux_value_type_id = create_attr->btf_vmlinux_value_type_id; +- else +- p.inner_map_fd = create_attr->inner_map_fd; +- +- return bpf_map_create(create_attr->map_type, create_attr->name, create_attr->key_size, +- create_attr->value_size, create_attr->max_entries, &p); +-} +- +-int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit) ++int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit) + { + unsigned name_len = attr->name ? strlen(attr->name) : 0; + char map_name[BPF_OBJ_NAME_LEN] = {}; + + memcpy(map_name, attr->name, min(name_len, BPF_OBJ_NAME_LEN - 1)); + attr->name = map_name; +- int ret = libbpf_bpf_map_create(attr); ++ int ret = bpf_create_map_xattr(attr); + + if (ret < 0 && errno == EPERM) { + if (!allow_rlimit) +@@ -357,7 +338,7 @@ int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit) + rl.rlim_max = RLIM_INFINITY; + rl.rlim_cur = rl.rlim_max; + if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0) +- ret = libbpf_bpf_map_create(attr); ++ ret = bpf_create_map_xattr(attr); + } + } + +@@ -367,12 +348,12 @@ int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit) + attr->btf_fd = 0; + attr->btf_key_type_id = 0; + attr->btf_value_type_id = 0; +- ret = libbpf_bpf_map_create(attr); ++ ret = bpf_create_map_xattr(attr); + } + + if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) { + map_name[0] = '\0'; +- ret = libbpf_bpf_map_create(attr); ++ ret = bpf_create_map_xattr(attr); + } + + if (ret < 0 && errno == EPERM) { +@@ -385,7 +366,7 @@ int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit) + rl.rlim_max = RLIM_INFINITY; + rl.rlim_cur = rl.rlim_max; + if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0) +- ret = libbpf_bpf_map_create(attr); ++ ret = bpf_create_map_xattr(attr); + } + } + return ret; +@@ -395,7 +376,7 @@ int bcc_create_map(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, + int max_entries, int map_flags) + { +- struct bcc_create_map_attr attr = {}; ++ struct bpf_create_map_attr attr = {}; + + attr.map_type = map_type; + attr.name = name; +@@ -644,70 +625,24 @@ int bpf_prog_get_tag(int fd, unsigned long long *ptag) + return -2; + } + +-static int libbpf_bpf_prog_load(enum bpf_prog_type prog_type, +- const char *prog_name, const char *license, +- const struct bpf_insn *insns, size_t insn_cnt, +- struct bpf_prog_load_opts *opts, +- char *log_buf, size_t log_buf_sz) +-{ +- +- LIBBPF_OPTS(bpf_prog_load_opts, p); +- +- if (!opts || !log_buf != !log_buf_sz) { +- errno = EINVAL; +- return -EINVAL; +- } +- +- p.expected_attach_type = opts->expected_attach_type; +- switch (prog_type) { +- case BPF_PROG_TYPE_STRUCT_OPS: +- case BPF_PROG_TYPE_LSM: +- p.attach_btf_id = opts->attach_btf_id; +- break; +- case BPF_PROG_TYPE_TRACING: +- case BPF_PROG_TYPE_EXT: +- p.attach_btf_id = opts->attach_btf_id; +- p.attach_prog_fd = opts->attach_prog_fd; +- break; +- default: +- p.prog_ifindex = opts->prog_ifindex; +- p.kern_version = opts->kern_version; +- } +- p.log_level = opts->log_level; +- p.log_buf = log_buf; +- p.log_size = log_buf_sz; +- p.prog_btf_fd = opts->prog_btf_fd; +- p.func_info_rec_size = opts->func_info_rec_size; +- p.func_info_cnt = opts->func_info_cnt; +- p.func_info = opts->func_info; +- p.line_info_rec_size = opts->line_info_rec_size; +- p.line_info_cnt = opts->line_info_cnt; +- p.line_info = opts->line_info; +- p.prog_flags = opts->prog_flags; +- +- return bpf_prog_load(prog_type, prog_name, license, +- insns, insn_cnt, &p); +-} +- +-int bcc_prog_load_xattr(enum bpf_prog_type prog_type, const char *prog_name, +- const char *license, const struct bpf_insn *insns, +- struct bpf_prog_load_opts *opts, int prog_len, ++int bcc_prog_load_xattr(struct bpf_load_program_attr *attr, int prog_len, + char *log_buf, unsigned log_buf_size, bool allow_rlimit) + { +- unsigned name_len = prog_name ? strlen(prog_name) : 0; +- char *tmp_log_buf = NULL, *opts_log_buf = NULL; +- unsigned tmp_log_buf_size = 0, opts_log_buf_size = 0; ++ unsigned name_len = attr->name ? strlen(attr->name) : 0; ++ char *tmp_log_buf = NULL, *attr_log_buf = NULL; ++ unsigned tmp_log_buf_size = 0, attr_log_buf_size = 0; + int ret = 0, name_offset = 0, expected_attach_type = 0; +- char new_prog_name[BPF_OBJ_NAME_LEN] = {}; ++ char prog_name[BPF_OBJ_NAME_LEN] = {}; + + unsigned insns_cnt = prog_len / sizeof(struct bpf_insn); ++ attr->insns_cnt = insns_cnt; + +- if (opts->log_level > 0) { ++ if (attr->log_level > 0) { + if (log_buf_size > 0) { + // Use user-provided log buffer if available. + log_buf[0] = 0; +- opts_log_buf = log_buf; +- opts_log_buf_size = log_buf_size; ++ attr_log_buf = log_buf; ++ attr_log_buf_size = log_buf_size; + } else { + // Create and use temporary log buffer if user didn't provide one. + tmp_log_buf_size = LOG_BUF_SIZE; +@@ -715,82 +650,82 @@ int bcc_prog_load_xattr(enum bpf_prog_type prog_type, const char *prog_name, + if (!tmp_log_buf) { + fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n", + strerror(errno)); +- opts->log_level = 0; ++ attr->log_level = 0; + } else { + tmp_log_buf[0] = 0; +- opts_log_buf = tmp_log_buf; +- opts_log_buf_size = tmp_log_buf_size; ++ attr_log_buf = tmp_log_buf; ++ attr_log_buf_size = tmp_log_buf_size; + } + } + } + +- + if (name_len) { +- if (strncmp(prog_name, "kprobe__", 8) == 0) ++ if (strncmp(attr->name, "kprobe__", 8) == 0) + name_offset = 8; +- else if (strncmp(prog_name, "kretprobe__", 11) == 0) ++ else if (strncmp(attr->name, "kretprobe__", 11) == 0) + name_offset = 11; +- else if (strncmp(prog_name, "tracepoint__", 12) == 0) ++ else if (strncmp(attr->name, "tracepoint__", 12) == 0) + name_offset = 12; +- else if (strncmp(prog_name, "raw_tracepoint__", 16) == 0) ++ else if (strncmp(attr->name, "raw_tracepoint__", 16) == 0) + name_offset = 16; +- else if (strncmp(prog_name, "kfunc__", 7) == 0) { ++ else if (strncmp(attr->name, "kfunc__", 7) == 0) { + name_offset = 7; + expected_attach_type = BPF_TRACE_FENTRY; +- } else if (strncmp(prog_name, "kmod_ret__", 10) == 0) { ++ } else if (strncmp(attr->name, "kmod_ret__", 10) == 0) { + name_offset = 10; + expected_attach_type = BPF_MODIFY_RETURN; +- } else if (strncmp(prog_name, "kretfunc__", 10) == 0) { ++ } else if (strncmp(attr->name, "kretfunc__", 10) == 0) { + name_offset = 10; + expected_attach_type = BPF_TRACE_FEXIT; +- } else if (strncmp(prog_name, "lsm__", 5) == 0) { ++ } else if (strncmp(attr->name, "lsm__", 5) == 0) { + name_offset = 5; + expected_attach_type = BPF_LSM_MAC; +- } else if (strncmp(prog_name, "bpf_iter__", 10) == 0) { ++ } else if (strncmp(attr->name, "bpf_iter__", 10) == 0) { + name_offset = 10; + expected_attach_type = BPF_TRACE_ITER; + } + +- if (prog_type == BPF_PROG_TYPE_TRACING || +- prog_type == BPF_PROG_TYPE_LSM) { +- ret = libbpf_find_vmlinux_btf_id(prog_name + name_offset, ++ if (attr->prog_type == BPF_PROG_TYPE_TRACING || ++ attr->prog_type == BPF_PROG_TYPE_LSM) { ++ ret = libbpf_find_vmlinux_btf_id(attr->name + name_offset, + expected_attach_type); + if (ret == -EINVAL) { + fprintf(stderr, "bpf: vmlinux BTF is not found\n"); + return ret; + } else if (ret < 0) { + fprintf(stderr, "bpf: %s is not found in vmlinux BTF\n", +- prog_name + name_offset); ++ attr->name + name_offset); + return ret; + } + +- opts->attach_btf_id = ret; +- opts->expected_attach_type = expected_attach_type; ++ attr->attach_btf_id = ret; ++ attr->expected_attach_type = expected_attach_type; + } + +- memcpy(new_prog_name, prog_name + name_offset, ++ memcpy(prog_name, attr->name + name_offset, + min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1)); ++ attr->name = prog_name; + } + +- ret = libbpf_bpf_prog_load(prog_type, new_prog_name, license, insns, insns_cnt, opts, opts_log_buf, opts_log_buf_size); ++ ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); + + // func_info/line_info may not be supported in old kernels. +- if (ret < 0 && opts->func_info && errno == EINVAL) { +- opts->prog_btf_fd = 0; +- opts->func_info = NULL; +- opts->func_info_cnt = 0; +- opts->func_info_rec_size = 0; +- opts->line_info = NULL; +- opts->line_info_cnt = 0; +- opts->line_info_rec_size = 0; +- ret = libbpf_bpf_prog_load(prog_type, new_prog_name, license, insns, insns_cnt, opts, opts_log_buf, opts_log_buf_size); ++ if (ret < 0 && attr->func_info && errno == EINVAL) { ++ attr->prog_btf_fd = 0; ++ attr->func_info = NULL; ++ attr->func_info_cnt = 0; ++ attr->func_info_rec_size = 0; ++ attr->line_info = NULL; ++ attr->line_info_cnt = 0; ++ attr->line_info_rec_size = 0; ++ ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); + } + + // BPF object name is not supported on older Kernels. + // If we failed due to this, clear the name and try again. + if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) { +- new_prog_name[0] = '\0'; +- ret = libbpf_bpf_prog_load(prog_type, new_prog_name, license, insns, insns_cnt, opts, opts_log_buf, opts_log_buf_size); ++ prog_name[0] = '\0'; ++ ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); + } + + if (ret < 0 && errno == EPERM) { +@@ -809,14 +744,14 @@ int bcc_prog_load_xattr(enum bpf_prog_type prog_type, const char *prog_name, + rl.rlim_max = RLIM_INFINITY; + rl.rlim_cur = rl.rlim_max; + if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0) +- ret = libbpf_bpf_prog_load(prog_type, new_prog_name, license, insns, insns_cnt, opts, opts_log_buf, opts_log_buf_size); ++ ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); + } + } + + if (ret < 0 && errno == E2BIG) { + fprintf(stderr, + "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n", +- strerror(errno), new_prog_name, insns_cnt, BPF_MAXINSNS); ++ strerror(errno), attr->name, insns_cnt, BPF_MAXINSNS); + return -1; + } + +@@ -825,9 +760,9 @@ int bcc_prog_load_xattr(enum bpf_prog_type prog_type, const char *prog_name, + // User has provided a log buffer. + if (log_buf_size) { + // If logging is not already enabled, enable it and do the syscall again. +- if (opts->log_level == 0) { +- opts->log_level = 1; +- ret = libbpf_bpf_prog_load(prog_type, new_prog_name, license, insns, insns_cnt, opts, log_buf, log_buf_size); ++ if (attr->log_level == 0) { ++ attr->log_level = 1; ++ ret = bpf_load_program_xattr(attr, log_buf, log_buf_size); + } + // Print the log message and return. + bpf_print_hints(ret, log_buf); +@@ -841,8 +776,8 @@ int bcc_prog_load_xattr(enum bpf_prog_type prog_type, const char *prog_name, + if (tmp_log_buf) + free(tmp_log_buf); + tmp_log_buf_size = LOG_BUF_SIZE; +- if (opts->log_level == 0) +- opts->log_level = 1; ++ if (attr->log_level == 0) ++ attr->log_level = 1; + for (;;) { + tmp_log_buf = malloc(tmp_log_buf_size); + if (!tmp_log_buf) { +@@ -851,7 +786,7 @@ int bcc_prog_load_xattr(enum bpf_prog_type prog_type, const char *prog_name, + goto return_result; + } + tmp_log_buf[0] = 0; +- ret = libbpf_bpf_prog_load(prog_type, new_prog_name, license, insns, insns_cnt, opts, tmp_log_buf, tmp_log_buf_size); ++ ret = bpf_load_program_xattr(attr, tmp_log_buf, tmp_log_buf_size); + if (ret < 0 && errno == ENOSPC) { + // Temporary buffer size is not enough. Double it and try again. + free(tmp_log_buf); +@@ -865,7 +800,7 @@ int bcc_prog_load_xattr(enum bpf_prog_type prog_type, const char *prog_name, + + // Check if we should print the log message if log_level is not 0, + // either specified by user or set due to error. +- if (opts->log_level > 0) { ++ if (attr->log_level > 0) { + // Don't print if user enabled logging and provided log buffer, + // but there is no error. + if (log_buf && ret < 0) +@@ -885,13 +820,16 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, + const char *license, unsigned kern_version, + int log_level, char *log_buf, unsigned log_buf_size) + { +- struct bpf_prog_load_opts opts = {}; +- ++ struct bpf_load_program_attr attr = {}; + ++ attr.prog_type = prog_type; ++ attr.name = name; ++ attr.insns = insns; ++ attr.license = license; + if (prog_type != BPF_PROG_TYPE_TRACING && prog_type != BPF_PROG_TYPE_EXT) +- opts.kern_version = kern_version; +- opts.log_level = log_level; +- return bcc_prog_load_xattr(prog_type, name, license, insns, &opts, prog_len, log_buf, log_buf_size, true); ++ attr.kern_version = kern_version; ++ attr.log_level = log_level; ++ return bcc_prog_load_xattr(&attr, prog_len, log_buf, log_buf_size, true); + } + + int bpf_open_raw_sock(const char *name) +@@ -1388,7 +1326,7 @@ int kernel_struct_has_field(const char *struct_name, const char *field_name) + struct btf *btf; + int i, ret, btf_id; + +- btf = btf__load_vmlinux_btf(); ++ btf = libbpf_find_kernel_btf(); + ret = libbpf_get_error(btf); + if (ret) + return -1; +@@ -1565,7 +1503,7 @@ int bpf_attach_xdp(const char *dev_name, int progfd, uint32_t flags) { + return -1; + } + +- ret = bpf_xdp_attach(ifindex, progfd, flags, NULL); ++ ret = bpf_set_link_xdp_fd(ifindex, progfd, flags); + if (ret) { + libbpf_strerror(ret, err_buf, sizeof(err_buf)); + fprintf(stderr, "bpf: Attaching prog to %s: %s\n", dev_name, err_buf); +diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h +index dd86f0a9..e001d740 100644 +--- a/src/cc/libbpf.h ++++ b/src/cc/libbpf.h +@@ -27,25 +27,8 @@ + extern "C" { + #endif + +-struct bcc_create_map_attr { +- const char *name; +- enum bpf_map_type map_type; +- __u32 map_flags; +- __u32 key_size; +- __u32 value_size; +- __u32 max_entries; +- __u32 numa_node; +- __u32 btf_fd; +- __u32 btf_key_type_id; +- __u32 btf_value_type_id; +- __u32 map_ifindex; +- union { +- __u32 inner_map_fd; +- __u32 btf_vmlinux_value_type_id; +- }; +-}; +- +-struct bpf_prog_load_opts; ++struct bpf_create_map_attr; ++struct bpf_load_program_attr; + + enum bpf_probe_attach_type { + BPF_PROBE_ENTRY, +@@ -61,7 +44,7 @@ struct bcc_perf_buffer_opts { + int bcc_create_map(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, int max_entries, + int map_flags); +-int bcc_create_map_xattr(struct bcc_create_map_attr *attr, bool allow_rlimit); ++int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit); + int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); + int bpf_lookup_elem(int fd, void *key, void *value); + int bpf_delete_elem(int fd, void *key); +@@ -89,11 +72,10 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, + const struct bpf_insn *insns, int prog_len, + const char *license, unsigned kern_version, + int log_level, char *log_buf, unsigned log_buf_size); +-int bcc_prog_load_xattr(enum bpf_prog_type prog_type, const char *prog_name, +- const char *license, const struct bpf_insn *insns, +- struct bpf_prog_load_opts *opts, ++int bcc_prog_load_xattr(struct bpf_load_program_attr *attr, + int prog_len, char *log_buf, + unsigned log_buf_size, bool allow_rlimit); ++ + int bpf_attach_socket(int sockfd, int progfd); + + /* create RAW socket. If name is not NULL/a non-empty null-terminated string, +-- +2.38.1 + diff --git a/bcc-0.25.0-Revert-tools-tcpaccept-Fix-support-for-v5.6-kernels.patch b/bcc-0.25.0-Revert-tools-tcpaccept-Fix-support-for-v5.6-kernels.patch new file mode 100644 index 0000000..1c31ae9 --- /dev/null +++ b/bcc-0.25.0-Revert-tools-tcpaccept-Fix-support-for-v5.6-kernels.patch @@ -0,0 +1,42 @@ +From acee5d39d24b102e8ed09a242cb1c53246a1fb7f Mon Sep 17 00:00:00 2001 +From: Jerome Marchand +Date: Tue, 29 Nov 2022 15:33:49 +0100 +Subject: [PATCH] Revert "tools/tcpaccept: Fix support for v5.6+ kernels" + +This reverts commit 28955512d991ee3849c2a9accfc54bef9cd35f21. + +It breaks tcpaccept on RHEL 8 kernel. +--- + tools/tcpaccept.py | 9 ++------- + 1 file changed, 2 insertions(+), 7 deletions(-) + +diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py +index b2ace4fa..d3e44143 100755 +--- a/tools/tcpaccept.py ++++ b/tools/tcpaccept.py +@@ -116,7 +116,7 @@ int kretprobe__inet_csk_accept(struct pt_regs *ctx) + return 0; + + // check this is TCP +- u16 protocol = 0; ++ u8 protocol = 0; + // workaround for reading the sk_protocol bitfield: + + // Following comments add by Joe Yin: +@@ -132,12 +132,7 @@ int kretprobe__inet_csk_accept(struct pt_regs *ctx) + int gso_max_segs_offset = offsetof(struct sock, sk_gso_max_segs); + int sk_lingertime_offset = offsetof(struct sock, sk_lingertime); + +- +- // Since kernel v5.6 sk_protocol is its own u16 field and gso_max_segs +- // precedes sk_lingertime. +- if (sk_lingertime_offset - gso_max_segs_offset == 2) +- protocol = newsk->sk_protocol; +- else if (sk_lingertime_offset - gso_max_segs_offset == 4) ++ if (sk_lingertime_offset - gso_max_segs_offset == 4) + // 4.10+ with little endian + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + protocol = *(u8 *)((u64)&newsk->sk_gso_max_segs - 3); +-- +2.38.1 + diff --git a/bcc-0.25.0-libbpf-Allow-kernel_struct_has_field-to-reach-field-.patch b/bcc-0.25.0-libbpf-Allow-kernel_struct_has_field-to-reach-field-.patch new file mode 100644 index 0000000..66baec5 --- /dev/null +++ b/bcc-0.25.0-libbpf-Allow-kernel_struct_has_field-to-reach-field-.patch @@ -0,0 +1,75 @@ +From 16277e3910c9281d807fc6d3b4ce41c62d7d265e Mon Sep 17 00:00:00 2001 +From: Jerome Marchand +Date: Thu, 19 May 2022 16:37:40 +0200 +Subject: [PATCH 3/3] libbpf: Allow kernel_struct_has_field to reach field in + unnamed struct or union + +Some fields can belong to unnamed struct or union (e.g. rcu and +rcu_users fields of task_struct). In C, they are accessed as if their +belong directly to the parent of the unnamed struct or union but this +is not the case for BTF. + +When looking for a field, kernel_struct_has_field should also look +reccursively into unnamed structs or unions. That allows code such as +the following to work as expected: + +BPF.kernel_struct_has_field('task_struct', 'rcu') + +Signed-off-by: Jerome Marchand +--- + src/cc/libbpf.c | 28 ++++++++++++++++++---------- + 1 file changed, 18 insertions(+), 10 deletions(-) + +diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c +index 5f7a3f68..bdfde1f5 100644 +--- a/src/cc/libbpf.c ++++ b/src/cc/libbpf.c +@@ -1319,12 +1319,27 @@ bool bpf_has_kernel_btf(void) + return true; + } + ++static int find_member_by_name(struct btf *btf, const struct btf_type *btf_type, const char *field_name) { ++ const struct btf_member *btf_member = btf_members(btf_type); ++ int i; ++ ++ for (i = 0; i < btf_vlen(btf_type); i++, btf_member++) { ++ const char *name = btf__name_by_offset(btf, btf_member->name_off); ++ if (!strcmp(name, field_name)) { ++ return 1; ++ } else if (name[0] == '\0') { ++ if (find_member_by_name(btf, btf__type_by_id(btf, btf_member->type), field_name)) ++ return 1; ++ } ++ } ++ return 0; ++} ++ + int kernel_struct_has_field(const char *struct_name, const char *field_name) + { + const struct btf_type *btf_type; +- const struct btf_member *btf_member; + struct btf *btf; +- int i, ret, btf_id; ++ int ret, btf_id; + + btf = libbpf_find_kernel_btf(); + ret = libbpf_get_error(btf); +@@ -1338,14 +1353,7 @@ int kernel_struct_has_field(const char *struct_name, const char *field_name) + } + + btf_type = btf__type_by_id(btf, btf_id); +- btf_member = btf_members(btf_type); +- for (i = 0; i < btf_vlen(btf_type); i++, btf_member++) { +- if (!strcmp(btf__name_by_offset(btf, btf_member->name_off), field_name)) { +- ret = 1; +- goto cleanup; +- } +- } +- ret = 0; ++ ret = find_member_by_name(btf, btf_type, field_name); + + cleanup: + btf__free(btf); +-- +2.38.1 + diff --git a/bcc.spec b/bcc.spec index a1cceb6..7f3f43c 100644 --- a/bcc.spec +++ b/bcc.spec @@ -1,4 +1,3 @@ -%define anolis_release .0.1 # luajit is not available RHEL 8 %bcond_with lua @@ -9,21 +8,19 @@ %endif Name: bcc -Version: 0.19.0 -Release: 5%{anolis_release}%{?dist} +Version: 0.25.0 +Release: 2%{?dist} Summary: BPF Compiler Collection (BCC) License: ASL 2.0 URL: https://github.com/iovisor/bcc Source0: %{url}/archive/v%{version}/%{name}-%{version}.tar.gz Patch0: %{name}-%{version}-Manpages-remove-unstable-statement.patch -Patch1: %{name}-%{version}-Fix-BPF-src_file-foo.patch -Patch2: %{name}-%{version}-Define-missing-BPF_-macros.patch -Patch3: %{name}-%{version}-fix-llvm-compilation-errors.patch -Patch4: %{name}-%{version}-Fix-a-llvm-compilation-error.patch -Patch5: %{name}-%{version}-Remove-APInt-APSInt-toString-std-string-variants.patch -Patch6: %{name}-%{version}-Handle-renaming-of-task_struct_-state-field-on-RHEL-.patch - -Patch1000: 1000-bcc-anolis-ali-enhanced-tools-for-fs-schedule-sys.patch +Patch1: %{name}-%{version}-RHEL-libbpf-version-fixes.patch +Patch2: %{name}-%{version}-libbpf-Allow-kernel_struct_has_field-to-reach-field-.patch +Patch3: %{name}-%{version}-Fix-bpf_pseudo_fd-type-conversion-error.patch +Patch4: %{name}-%{version}-Fix-clang-15-int-to-pointer-conversion-errors.patch +Patch5: %{name}-%{version}-Revert-tools-tcpaccept-Fix-support-for-v5.6-kernels.patch +Patch6: %{name}-%{version}-Fix-get_kprobe_functions.patch # Arches will be included as upstream support is added and dependencies are # satisfied in the respective arches @@ -44,9 +41,9 @@ BuildRequires: ncurses-devel %if %{with lua} BuildRequires: pkgconfig(luajit) %endif -BuildRequires: libbpf-devel >= 0.0.9, libbpf-static >= 0.0.9 +BuildRequires: libbpf-devel >= 0.5.0, libbpf-static >= 0.5.0 -Requires: libbpf >= 0.0.9 +Requires: libbpf >= 0.5.0 Requires: tar Recommends: kernel-devel Recommends: %{name}-tools = %{version}-%{release} @@ -103,6 +100,7 @@ Standalone tool to run BCC tracers written in Lua %package tools Summary: Command line tools for BPF Compiler Collection (BCC) +Requires: bcc = %{version}-%{release} Requires: python3-%{name} = %{version}-%{release} Requires: python3-netaddr @@ -220,8 +218,22 @@ done %changelog -* Sun Apr 24 2022 Chunmei Xu - 0.19.0-5.0.1 -- add ali enhanced tools for fs schedule and sys +* Tue Jan 10 2023 Jerome Marchand - 0.25.0-2 +- Fix tcpdrop tool + +* Wed Nov 30 2022 Jerome Marchand - 0.25.0-1 +- Rebase to bcc-0.25.0 +- Rebuild on LLVM 15 + +* Thu Jun 23 2022 Jerome Marchand - 0.24.0-2 +- Rebuild on libbpf 0.5.0 + +* Thu Jun 09 2022 Jerome Marchand - 0.24.0-1 +- Rebase to bcc-0.24.0 +- Rebuild on LLVM 14 + +* Mon Dec 06 2021 Jerome Marchand - 0.19.0-6 +- Add excplicit requirement in bcc-tools for rpmdiff * Tue Nov 23 2021 Jerome Marchand - 0.19.0-5 - Handle the renaming of task_struct_>state field diff --git a/download b/download index c460eb7..3f2bc08 100644 --- a/download +++ b/download @@ -1 +1 @@ -edb940b8f70efb9953446548db3c9a11 bcc-0.19.0.tar.gz +6cfc69c9fac8d1efe8902e8cc46f6495 bcc-0.25.0.tar.gz -- Gitee From c19da29b9662074380c91d5c7a5ab88a31702b90 Mon Sep 17 00:00:00 2001 From: Weisson Date: Fri, 30 Jun 2023 17:41:50 +0800 Subject: [PATCH 2/2] Remove loongarch64 arch. Signed-off-by: Weisson --- bcc.spec | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bcc.spec b/bcc.spec index 7f3f43c..dcb1728 100644 --- a/bcc.spec +++ b/bcc.spec @@ -1,3 +1,5 @@ +%global anolis_release .0.1 + # luajit is not available RHEL 8 %bcond_with lua @@ -9,7 +11,7 @@ Name: bcc Version: 0.25.0 -Release: 2%{?dist} +Release: 2%{anolis_release}%{?dist} Summary: BPF Compiler Collection (BCC) License: ASL 2.0 URL: https://github.com/iovisor/bcc @@ -25,6 +27,7 @@ Patch6: %{name}-%{version}-Fix-get_kprobe_functions.patch # Arches will be included as upstream support is added and dependencies are # satisfied in the respective arches ExcludeArch: i686 +Excludearch: loongarch64 BuildRequires: bison BuildRequires: cmake >= 2.8.7 @@ -218,6 +221,9 @@ done %changelog +* Fri Jun 30 2023 Weisson - 5.14.0-4.0.1 +- Remove loongarch64 arch + * Tue Jan 10 2023 Jerome Marchand - 0.25.0-2 - Fix tcpdrop tool -- Gitee