diff --git a/License/LICENSE b/License/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..ee5839968a2bf86c93283efc09d40fd050b7cfa2 --- /dev/null +++ b/License/LICENSE @@ -0,0 +1,127 @@ + 木兰宽松许可证, 第2版 + + 木兰宽松许可证, 第2版 + 2020年1月 http://license.coscl.org.cn/MulanPSL2 + + + 您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束: + + 0. 定义 + + “软件”是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。 + + “贡献”是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。 + + “贡献者”是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。 + + “法人实体”是指提交贡献的机构及其“关联实体”。 + + “关联实体”是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 + + 1. 授予版权许可 + + 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。 + + 2. 授予专利许可 + + 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。 + + 3. 无商标许可 + + “本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 + + 4. 分发限制 + + 您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。 + + 5. 免责声明与责任限制 + + “软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 + + 6. 语言 + “本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 + + 条款结束 + + 如何将木兰宽松许可证,第2版,应用到您的软件 + + 如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步: + + 1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字; + + 2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中; + + 3, 请将如下声明文本放入每个源文件的头部注释中。 + + Copyright (c) [Year] [name of copyright holder] + [Software Name] is licensed under Mulan PSL v2. + You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ You may obtain a copy of Mulan PSL v2 at: + http://license.coscl.org.cn/MulanPSL2 + THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + See the Mulan PSL v2 for more details. + + + Mulan Permissive Software License,Version 2 + + Mulan Permissive Software License,Version 2 (Mulan PSL v2) + January 2020 http://license.coscl.org.cn/MulanPSL2 + + Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions: + + 0. Definition + + Software means the program and related documents which are licensed under this License and comprise all Contribution(s). + + Contribution means the copyrightable work licensed by a particular Contributor under this License. + + Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. + + Legal Entity means the entity making a Contribution and all its Affiliates. + + Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. + + 1. Grant of Copyright License + + Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. + + 2. 
Grant of Patent License + + Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. + + 3. No Trademark License + + No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in Section 4. + + 4. Distribution Restriction + + You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. + + 5. Disclaimer of Warranty and Limitation of Liability + + THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. 
IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + + 6. Language + + THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. + + END OF THE TERMS AND CONDITIONS + + How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software + + To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: + + i Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner; + + ii Create a file named “LICENSE” which contains the whole context of this License in the first directory of your software package; + + iii Attach the statement to the appropriate annotated syntax at the beginning of each source file. + + + Copyright (c) [Year] [name of copyright holder] + [Software Name] is licensed under Mulan PSL v2. + You can use this software according to the terms and conditions of the Mulan PSL v2. + You may obtain a copy of Mulan PSL v2 at: + http://license.coscl.org.cn/MulanPSL2 + THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + See the Mulan PSL v2 for more details. 
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bbfbea341d80bc5caf45f88cede4a717cefe70b1
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+OUTPUT := .output
+CLANG ?= clang -v
+BPFTOOL ?= $(abspath tools/bpftool)
+PAHOLE ?= $(abspath tools/pahole)
+READELF ?= readelf
+VMLINUX ?= /usr/lib/debug/lib/modules/`uname -r`/vmlinux
+VMLINUX_HEADER ?= $(OUTPUT)/vmlinux.h
+
+# Non-empty when the kernel image already carries a .BTF section.
+BTF_PAHOLE_PROBE := $(shell $(READELF) -S $(VMLINUX) | grep .BTF 2>&1)
+INCLUDES := -I$(OUTPUT)
+CFLAGS := -g -Wall
+ARCH := $(shell uname -m | sed 's/x86_64/x86/')
+
+# No .BTF section found: let pahole convert DWARF to BTF before dumping vmlinux.h.
+ifeq ($(BTF_PAHOLE_PROBE),)
+  DWARF2BTF = y
+endif
+
+APPS = readahead_tune
+
+# Collect clang's default system include directories from the `clang -v -E`
+# search list and pass each one as -idirafter to the `-target bpf` compile.
+# stdin is redirected from /dev/null so clang does not wait for input, and
+# stderr is merged into stdout because the search list is printed on stderr.
+CLANG_BPF_SYS_INCLUDES = $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+# V=1 prints the full commands; otherwise print a short "TAG target" line.
+ifeq ($(V),1)
+  Q =
+  msg =
+else
+  Q = @
+  msg = @printf ' %-8s %s%s\n' \
+	"$(1)" \
+	"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
+	"$(if $(3), $(3))";
+  MAKEFLAGS += --no-print-directory
+endif
+
+.PHONY: all
+all: $(APPS)
+
+# Same as `all`, but BPF objects are compiled with -DBPFDEBUG.
+.PHONY: debug
+debug: DEBUG_FLAGS = -DBPFDEBUG
+debug: all
+
+.PHONY: clean
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS)
+
+$(OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+$(VMLINUX_HEADER):
+	$(call msg,GEN-VMLINUX_H,$@)
+ifeq ($(DWARF2BTF),y)
+	$(Q)$(PAHOLE) -J $(VMLINUX)
+endif
+	$(Q)$(BPFTOOL) btf dump file $(VMLINUX) format c > $@
+
+# Build BPF code
+$(OUTPUT)/%.bpf.o: %.bpf.c $(wildcard %.h) | $(OUTPUT) $(VMLINUX_HEADER)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -D__KERNEL__ -D__ASM_SYSREG_H -D__TARGET_ARCH_$(ARCH) \
+		$(DEBUG_FLAGS) \
+		$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
+		-g -O2 -target bpf -c $(filter %.c,$^) -o $@
+
+# Generate BPF skeletons
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build common-helper code
+$(OUTPUT)/common_helper.o: common_helper.c | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) -o $@
+
+# Build user-space code
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+
+$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+# Build application binary
+# In the dynamic-linking scenario, try dropping -lelf and -lz
+$(APPS): %: $(OUTPUT)/%.o $(OUTPUT)/common_helper.o | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ -lbpf -lelf -lz -o $@
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/README.en.md b/README.en.md
deleted file mode 100644
index 36713033942086f4e355376e0c047550f1328d64..0000000000000000000000000000000000000000
--- a/README.en.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# A-Tune-BPF-Collection
-
-#### Description
-A-Tune-BPF-Collection is BPF based tunning tools collection
-
-#### Software Architecture
-Software architecture description
-
-#### Installation
-
-1. xxxx
-2. xxxx
-3. xxxx
-
-#### Instructions
-
-1. xxxx
-2. xxxx
-3. xxxx
-
-#### Contribution
-
-1. Fork the repository
-2. Create Feat_xxx branch
-3. Commit your code
-4. Create Pull Request
-
-
-#### Gitee Feature
-
-1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md
-2. Gitee blog [blog.gitee.com](https://blog.gitee.com)
-3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore)
-4. The most valuable open source project [GVP](https://gitee.com/gvp)
-5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help)
-6. 
The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) diff --git a/README.md b/README.md index 47b930d988d92167b132438e8caf2b0a99aebc4a..0941b3d2b281b684d8ce2ec13246c2cb6d0cf7de 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,38 @@ # A-Tune-BPF-Collection #### 介绍 -A-Tune-BPF-Collection is BPF based tunning tools collection +A-Tune-BPF-Collection是BPF工具集,这些BPF程序用以跟踪内核行为模式以实时细粒度地调整内核参数,达到提升系统性能的目标。 #### 软件架构 -软件架构说明 +以`readahead_tune`为例来介绍A-Tune-BPF-Collection中的BPF程序。 +`readahead_tune`包含两部分: + +* BPF program(`readahead_tune.bpf`):加载到内核的BPF program以跟踪ext4/xfs文件系统的文件读操作 +* BPF control program(`readahead_tune`):读取配置文件,配置BPF program的参数,随即加载BPF program到内核 #### 安装教程 -1. xxxx -2. xxxx -3. xxxx +通过rpm命令或者yum安装`A-Tune-BPF-Collection` rpm包: + +``` +# yum install A-Tune-BPF-Collection +or +# rpm -ivh A-Tune-BPF-Collection-{version}.x86_64.rpm +``` #### 使用说明 -1. xxxx -2. xxxx -3. xxxx +以`readahead_tune`为例介绍A-Tune-BPF-Collection中的BPF程序的使用: + +1. (可选)默认配置文件会安装在`/etc/sysconfig/readahead_tune.conf`,也可以自己新建配置文件,通过`start_readahead_tune`命令`-c|--config`选项指定配置文件路径。若未指定配置文件,则会使用默认安装的配置文件。 + + ``` + 注意:仅支持通过完整路径名指定配置文件,相对路径会无法识别;配置文件中若存在不合法的选项配置或者配置选项缺失,都会使用该选项的默认配置值。 + ``` + +2. 通过`start_readahead_tune`命令启动/加载`readahead_tune.bpf` BPF Program。命令使用方法可以通过`start_readahead_tune -h|--help`帮助命令查看。 +3. 
通过`stop_readahead_tune`命令停止/卸载`readahead_tune.bpf` BPF Program。 #### 参与贡献 diff --git a/atune_bpf_collection.spec b/atune_bpf_collection.spec new file mode 100644 index 0000000000000000000000000000000000000000..2afb972248cf894d8fef1838456e91ad2dad9b97 --- /dev/null +++ b/atune_bpf_collection.spec @@ -0,0 +1,43 @@ +Name: A-Tune-BPF-Collection +Version: 0.1 +Release: 1 +License: Mulan PSL v2 +Summary: BPF program collection to adjust fine-grained kernel mode to get better performance +URL: https://gitee.com/openeuler/A-Tune-BPF-Collection +Source0: https://gitee.com/openeuler/A-Tune-BPF-Collection/repository/archive/v%{version}.tar.gz + +BuildRequires: clang, llvm, libbpf-devel +Requires: libbpf +Provides: readahead_tune + +%define debug_package %{nil} + +%description +A-Tune BPF Collection contains a set of BPF programs which can interact with the kernel in real time. +It has the following capabilities: +readahead_tune: trace file reading characteristics, then adjust file read mode to get maximum I/O efficiency + +%prep +%autosetup -n %{name}-%{version} -p1 + +%build +make %{?_smp_mflags} + +%install +install -D -p -m 0755 readahead_tune %{buildroot}/%{_sbindir}/readahead_tune +install -D -p -m 0755 start_readahead_tune %{buildroot}/%{_sbindir}/start_readahead_tune +install -D -p -m 0755 stop_readahead_tune %{buildroot}/%{_sbindir}/stop_readahead_tune +install -D -p -m 0644 readahead_tune.conf %{buildroot}%{_sysconfdir}/sysconfig/readahead_tune.conf + +%files +%{_sbindir}/readahead_tune +%{_sbindir}/start_readahead_tune +%{_sbindir}/stop_readahead_tune +%config(noreplace) %{_sysconfdir}/sysconfig/readahead_tune.conf + +%changelog +* Tue Nov 9 2021 lvying - 0.1-1 +- Type:feature +- ID:NA +- SUG:NA +- DESC: Init A-Tune-BPF-Collection repo and add readahead_tune service diff --git a/common_helper.c b/common_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..703f19c65d07a43d46ed365b7946ea7214c28e9f --- /dev/null +++ b/common_helper.c @@ -0,0 +1,170 @@ 
+#include +#include +#include +#include +#include +#include +#include "common_helper.h" + +static void free_opt(struct opt *opt) +{ + if (!opt) { + return; + } + free(opt->name); + free(opt->val); + free(opt); +} + +static struct opt *new_opt(char *name, char *val) +{ + struct opt *opt = (struct opt*)malloc(sizeof(struct opt)); + if (opt) { + opt->name = strdup(name); + opt->val = strdup(val); + opt->next = NULL; + if (!opt->name || !opt->val) { + free_opt(opt); + opt = NULL; + } + } + return opt; +} + +struct opt **parse_init(unsigned int size) +{ + struct opt **opts = (struct opt **)malloc(size * sizeof(struct opt *)); + if (opts) + memset(opts, 0, size * sizeof(struct opt*)); + return opts; +} + +void parse_fini(struct opt **opts, unsigned int size) +{ + if (!opts) { + return; + } + + for (unsigned int i = 0; i < size; i++) { + if (opts[i] != NULL) { + free_opt(opts[i]); + } + } +} + +static int empty(char *s) +{ + while (isspace(*s)) { + ++s; + } + return *s == 0; +} + +static char *strstrip(char *s) +{ + while (isspace(*s)) { + s++; + } + char *p = s + strlen(s) - 1; + if (p <= s) { + return s; + } + while (isspace(*p) && p >= s) { + *p-- = 0; + } + return s; +} + +/* djb hash */ +static unsigned hash(const char *str, unsigned int size) +{ + unsigned hash = 5381; + + for (const unsigned char *s = (const unsigned char *)str; *s; s++) { + hash = (hash * 32) + hash + *s; + } + return hash % size; +} + +int parse_config_file(unsigned int where, const char *conf_fn, struct opt **opts, unsigned int size) +{ + char *line = NULL; + size_t linelen = 0; + char *val = NULL; + unsigned int lineno = 1; + int ret = 0; + + FILE *f = fopen(conf_fn, "r"); + if (!f) { + log(where, LOG_ERR, "fopen config file(%s) failed\n", conf_fn); + return -1; + } + + while (getline(&line, &linelen, f) > 0) { + char *s = strchr(line, '#'); + if (s) { + *s = 0; + } + s = strstrip(line); + if ((val = strchr(line, '=')) != NULL) { + *val++ = 0; + char *name = strstrip(s); + val = strstrip(val); 
+ struct opt * opt = new_opt(name, val); + if (!opt) { + ret = -1; + log(where, LOG_ERR, "failed to alloc opt struct\n"); + goto cleanup; + } + unsigned int h = hash(name, size); + if (opts[h]) { + struct opt *next = opts[h]->next; + opts[h]->next = opt; + opt->next = next; + } else { + opts[h] = opt; + } + } else if (!empty(s)) { + ret = -1; + log(where, LOG_ERR, "config file(%s) line(%u) is not field\n", conf_fn, lineno); + goto cleanup; + } + lineno++; + } + goto ret; + +cleanup: + parse_fini(opts, size); + +ret: + fclose(f); + free(line); + return ret; +} + +char* config_opt(struct opt **opts, unsigned int size, const char *name) +{ + unsigned int h = hash(name, size); + struct opt *header = opts[h]; + while (header) { + if (strcmp(header->name, name)) { + header = header->next; + } else { + break; + } + } + return header ? header->val : NULL; +} + +void bump_memlock_rlimit(unsigned int where) +{ + struct rlimit rlim_new = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) { + log(where, LOG_ERR, "Failed to increase RLIMIT_MEMLOCK limit!\n"); + exit(1); + } +} diff --git a/common_helper.h b/common_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..8fc52a610cfc058c5e174b16cd95335836e4ed49 --- /dev/null +++ b/common_helper.h @@ -0,0 +1,32 @@ +#ifndef _COMMON_HELPER_H +#define _COMMON_HELPER_H + +#include + +#define SHASH 11 +#define SYSLOG (1 << 0) +#define TERM (1 << 1) +#define ALL (SYSLOG | TERM) + +#define log(where, level, fmt, args...) 
do { \ + if (where & SYSLOG) \ + syslog(level, fmt, ##args); \ + if (where & TERM) { \ + fprintf(stderr, fmt, ##args); \ + fflush(stderr); \ + } \ +} while (0) + +struct opt { + struct opt *next; + char *name; + char *val; +}; + +struct opt **parse_init(unsigned int size); +void parse_fini(struct opt **opts, unsigned int size); +int parse_config_file(unsigned int where, const char *conf_fn, struct opt **opts, unsigned int size); +char* config_opt(struct opt **opts, unsigned int size, const char *name); +void bump_memlock_rlimit(unsigned int where); + +#endif diff --git a/readahead_tune.bpf.c b/readahead_tune.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..544814f90388b63012282328b51f1dd5dcfacb01 --- /dev/null +++ b/readahead_tune.bpf.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 OR MulanPSL-2.0 +/* Copyright (C) 2021. Huawei Technologies Co., Ltd */ +#define BPF_NO_PRESERVE_ACCESS_INDEX + +#include "vmlinux.h" +#include +#include "readahead_tune.h" + +/* + * Need to keep consistent with definitions in include/linux/fs.h + * vmlinux.h does not contain macro + */ +#define FMODE_RANDOM 0x1000 +#define FMODE_WILLNEED 0x400000 + +#define PREFIX_PATTERN "blk_" +#define MAX_HASH_TABLE_ENTRY 10000 +#define MAP_ARRAY_SIZE 10 + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; + +struct fs_file_read_args { + struct fs_file_read_ctx *ctx; + int version; +}; + +struct fs_file_release_args { + void *inode; + void *filp; +}; + +struct file_rd_hnode { + __u64 last_nsec; + __u64 seq_nr; + __u64 tot_nr; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, unsigned long long); + __uint(max_entries, MAP_ARRAY_SIZE); +} arraymap SEC(".maps"); + +struct bpf_map_def SEC("maps") htab = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(long), + .value_size = sizeof(struct file_rd_hnode), + .max_entries = MAX_HASH_TABLE_ENTRY, +}; + +/* + * This first paramater should be the exact position 
of variable, otherwise the start + * position of array, and then access the variable by array[index], BPF verifier will + * warn: "load bpf program failed: Permission denied ... variable stack access var_off" + */ +static __always_inline void get_conf(unsigned long long *file_read_conf, unsigned int index) +{ + const char conf_fmt[] = "option is %llu\n"; + void *value = bpf_map_lookup_elem(&arraymap, &index); + if (value) { + *file_read_conf = *(unsigned long long *)value; + } + bpf_trace_printk(conf_fmt, sizeof(conf_fmt), *file_read_conf); +} + +static __always_inline bool is_expected_file(void *name) +{ + char prefix[5]; + int err = bpf_probe_read_str(&prefix, sizeof(prefix), name); + if (err <= 0) { + return false; + } + return !__builtin_memcmp(prefix, PREFIX_PATTERN, sizeof(PREFIX_PATTERN) - 1); +} + +SEC("raw_tracepoint.w/fs_file_read") +int fs_file_read(struct fs_file_read_args *args) +{ + const char fmt[] = "elapsed %llu, seq %u, tot %u\n"; + struct fs_file_read_ctx *rd_ctx = args->ctx; + unsigned long long file_read_conf[CONF_NUM] = { + DEFAULT_FILESZ, + DEFAULT_READ_TIME, + DEFAULT_TOTAL_READ, + DEFAULT_LOWER_BOUND, + DEFAULT_UPPER_BOUND + }; + + if (!is_expected_file((void *)rd_ctx->name)) { + return 0; + } + + /* + * Get user configuration, 4.19 kernel does not support + * BPF program for-loop + */ + get_conf(file_read_conf + CONF_FILESZ, CONF_FILESZ); + get_conf(file_read_conf + CONF_READ_TIME, CONF_READ_TIME); + get_conf(file_read_conf + CONF_TOTAL_READ, CONF_TOTAL_READ); + get_conf(file_read_conf + CONF_LOWER_BOUND, CONF_LOWER_BOUND); + get_conf(file_read_conf + CONF_UPPER_BOUND, CONF_UPPER_BOUND); + + if (rd_ctx->i_size <= file_read_conf[CONF_FILESZ]) { + rd_ctx->set_f_mode = FMODE_WILLNEED; + return 0; + } + + __u64 now = bpf_ktime_get_ns(); + __u64 key = rd_ctx->key; + bool first = false; + struct file_rd_hnode *hist = bpf_map_lookup_elem(&htab, &key); + struct file_rd_hnode new_hist; + if (!hist) { + __builtin_memset(&new_hist, 0, 
sizeof(new_hist)); + new_hist.last_nsec = now; + first = true; + hist = &new_hist; + } + + /* the consecutive read pos of the same file spatially local */ + if (rd_ctx->index >= rd_ctx->prev_index && + rd_ctx->index - rd_ctx->prev_index <= 1) { + hist->seq_nr += 1; + } + hist->tot_nr += 1; + + bpf_trace_printk(fmt, sizeof(fmt), now - hist->last_nsec, + hist->seq_nr, hist->tot_nr); + + if (first) { + bpf_map_update_elem(&htab, &key, hist, 0); + return 0; + } + + if (now - hist->last_nsec >= file_read_conf[CONF_READ_TIME] || hist->tot_nr >= file_read_conf[CONF_TOTAL_READ]) { + if (hist->tot_nr >= file_read_conf[CONF_TOTAL_READ]) { + if (hist->seq_nr <= hist->tot_nr * file_read_conf[CONF_LOWER_BOUND] / HUNDRED_PERCENTAGE) { + rd_ctx->set_f_mode = FMODE_RANDOM; + } else if (hist->seq_nr >= hist->tot_nr * file_read_conf[CONF_UPPER_BOUND] / HUNDRED_PERCENTAGE) { + rd_ctx->clr_f_mode = FMODE_RANDOM; + } + } + + hist->last_nsec = now; + hist->tot_nr = 0; + hist->seq_nr = 0; + } + + return 0; +} + +SEC("raw_tracepoint/fs_file_release") +int fs_file_release(struct fs_file_release_args *args) +{ + __u64 key = (unsigned long)args->filp; + void *value = bpf_map_lookup_elem(&htab, &key); + if (value) { + bpf_map_delete_elem(&htab, &key); + } + + return 0; +} diff --git a/readahead_tune.c b/readahead_tune.c new file mode 100644 index 0000000000000000000000000000000000000000..689370cc2741fb2c145aecf2c3314a8d43a868b8 --- /dev/null +++ b/readahead_tune.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021. 
Huawei Technologies Co., Ltd */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "readahead_tune.h" +#include "common_helper.h" +#include "readahead_tune.skel.h" + +#define DEFAULT_CONF "/etc/sysconfig/readahead_tune.conf" + +struct env_conf { + const char *name; + unsigned long long default_val; +}; + +const struct env_conf confs[CONF_NUM] = { + {"filesz-threshold", DEFAULT_FILESZ}, + {"read-time-threshold", DEFAULT_READ_TIME}, + {"total-read-threshold", DEFAULT_TOTAL_READ}, + {"lower-bound-percentage", DEFAULT_LOWER_BOUND}, + {"upper-bound-percentage", DEFAULT_UPPER_BOUND} +}; + +static int parse_config(unsigned int where, struct readahead_tune_bpf *skel, const char *conf_fn) +{ + unsigned long long parse_conf[CONF_NUM] = {0}; + struct opt **opts = parse_init(SHASH); + if (!opts) { + log(where, LOG_ERR, "parse_init failed, all the option use default value\n"); + goto use_default; + } + + if (parse_config_file(where, conf_fn, opts, SHASH)) { + log(where, LOG_ERR, "parse_config_file failed, all the option use default value\n"); + goto use_default; + } + + for (int i = 0; i < CONF_NUM; i++) { + char *env_str = config_opt(opts, SHASH, confs[i].name); + if (env_str && strlen(env_str)) { + char *endptr; + errno = 0; + long long val = strtoll(env_str, &endptr, 10); + + if ((errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) + || (errno != 0 && val == 0) + || endptr == env_str + || *endptr != '\0' + || val < 0) { + log(where, LOG_ERR, "Option %s value is %s, use default!\n", confs[i].name, env_str); + parse_conf[i] = confs[i].default_val; + } else { + parse_conf[i] = (unsigned long long)val; + } + } else { + parse_conf[i] = confs[i].default_val; + } + } + + if (parse_conf[CONF_LOWER_BOUND] >= parse_conf[CONF_UPPER_BOUND] + || parse_conf[CONF_LOWER_BOUND] <= ZERO_PERCENTAGE + || parse_conf[CONF_UPPER_BOUND] >= HUNDRED_PERCENTAGE) { + log(where, LOG_ERR, "lower-bound-percentage(%llu), 
upper-bound-percentage(%llu) is invalid, use default\n", + parse_conf[CONF_LOWER_BOUND], parse_conf[CONF_UPPER_BOUND]); + parse_conf[CONF_LOWER_BOUND] = DEFAULT_LOWER_BOUND; + parse_conf[CONF_UPPER_BOUND] = DEFAULT_UPPER_BOUND; + } + + if (parse_conf[CONF_TOTAL_READ] * parse_conf[CONF_LOWER_BOUND] < parse_conf[CONF_TOTAL_READ] + || parse_conf[CONF_TOTAL_READ] * parse_conf[CONF_UPPER_BOUND] < parse_conf[CONF_TOTAL_READ]) { + log(where, LOG_ERR, "total-read-threshold(%llu) is too large, use default\n", parse_conf[CONF_TOTAL_READ]); + parse_conf[CONF_TOTAL_READ] = DEFAULT_TOTAL_READ; + } + goto success_parse; + +use_default: + for (int i = 0; i < CONF_NUM; i++) { + parse_conf[i] = confs[i].default_val; + } + +success_parse: + parse_fini(opts, SHASH); + for (unsigned int i = 0; i < CONF_NUM; i++) { + if (bpf_map_update_elem(bpf_map__fd(skel->maps.arraymap), &i, parse_conf + i, BPF_ANY)) { + return -1; + } + } + log(where, LOG_INFO, "All the file_read_conf finally set as following:\n"); + for (int i = 0; i < CONF_NUM; i++) { + log(where, LOG_INFO, "Config %s = %llu\n", confs[i].name, parse_conf[i]); + } + return 0; +} + +bool verbose; +bool foreground; +const char * config_file = DEFAULT_CONF; +static const struct argp_option opts[] = { + { "verbose", 'v', NULL, 0, "Verbose debug output" }, + { "foreground", 'f', NULL, 0, "Run foreground, not daemonize" }, + { "config", 'c', "CONFIG_FILE", 0, "Config file to specify" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case 'v': + verbose = true; + break; + case 'f': + foreground = true; + break; + case 'c': + config_file = !arg ? 
DEFAULT_CONF : arg; + break; + case ARGP_KEY_ARG: + argp_usage(state); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static const struct argp argp = { + .options = opts, + .parser = parse_arg, +}; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level >= LIBBPF_DEBUG && !verbose) { + return 0; + } + /* syslog output is Mojibake, and No log_buf to check kernel BPF verifier log */ + if (foreground) { + return vfprintf(stderr, format, args); + } + syslog(LOG_ERR, format, args); + return 0; +} + +static volatile bool exiting = false; + +static void sig_handler(int sig) +{ + log((foreground ? TERM : SYSLOG), LOG_INFO, "Gracefully exit...\n"); + exiting = true; +} + +int main(int argc, char *argv[]) +{ + struct readahead_tune_bpf *skel; + unsigned int where = TERM; + + int err = argp_parse(&argp, argc, argv, 0, NULL, NULL); + if (err) { + return err; + } + + if (!foreground) { + err = daemon(0, 0); + where = SYSLOG; + if (err) { + perror("Failed to daemon\n"); + return err; + } + } + + signal(SIGTERM, sig_handler); + + /* Set up libbpf errors and debug info callback */ + libbpf_set_print(libbpf_print_fn); + + /* Bump RLIMIT_MEMLOCK to create BPF maps */ + bump_memlock_rlimit(where); + + skel = readahead_tune_bpf__open_and_load(); + if (!skel) { + log(where, LOG_ERR, "Failed to open and load BPF skeleton\n"); + return -1; + } + + err = parse_config(where, skel, config_file); + if (err) { + log(where, LOG_ERR, "Failed to write config value into BPF ARRAY MAP\n"); + goto cleanup; + } + + err = readahead_tune_bpf__attach(skel); + if (err) { + log(where, LOG_ERR, "Failed to attach BPF skeleton\n"); + goto cleanup; + } + + while (!exiting) { + pause(); + } + +cleanup: + readahead_tune_bpf__destroy(skel); + return err; +} diff --git a/readahead_tune.conf b/readahead_tune.conf new file mode 100644 index 0000000000000000000000000000000000000000..953e17ef9a8955aed844b6755b1f70cb15106cfd --- /dev/null +++ 
b/readahead_tune.conf @@ -0,0 +1,29 @@ +# readahead tune +# Note: Run-time configuration is unsupported, service restart needed. +# Note: this file should be installed at /etc/sysconfig/readahead_tune.conf + +# Specify the threshold of file size. +# BPF program only traces files whose size exceeds the threshold +# Default unit is byte, other unit conf is not supported, default is 4MB +filesz-threshold=4194304 + +# Specify the sampling times of file read +# BPF program judges the file reading characteristics based on the sampling times of file read +# and then adjusts the file reading mode +total-read-threshold=10 + +# Specify the time interval threshold between two consecutive readings of the same file +# If consecutive file read exceeds the time interval threshold, BPF program resamples +# the file read to determine the file reading characteristics +# Default unit is ns, other unit conf is not supported, default is 500ms +read-time-threshold=500000000 + +# Specify the lower bound percentage of sequential read ratio +# If sequential read ratio is lower than the lower bound, BPF program will set file read mode FMODE_RANDOM +# the range is (0, 100) +lower-bound-percentage=30 + +# Specify the upper bound percentage of sequential read ratio +# If sequential read ratio is higher than the upper bound, BPF program will clear file read mode FMODE_RANDOM +# the range is (0, 100), upper-bound-percentage > lower-bound-percentage is required +upper-bound-percentage=70 diff --git a/readahead_tune.h b/readahead_tune.h new file mode 100644 index 0000000000000000000000000000000000000000..68a9c3d416e6e3abfe0dc8af9f48da513ef86816 --- /dev/null +++ b/readahead_tune.h @@ -0,0 +1,21 @@ +#ifndef _READAHEAD_TUNE_H +#define _READAHEAD_TUNE_H + +#define ZERO_PERCENTAGE 0 +#define HUNDRED_PERCENTAGE 100 +#define DEFAULT_FILESZ (4<< 20) +#define DEFAULT_READ_TIME 500000000ULL +#define DEFAULT_TOTAL_READ 10 +#define DEFAULT_LOWER_BOUND 30 +#define DEFAULT_UPPER_BOUND 70 + +enum conf_type { 
+    CONF_FILESZ,
+    CONF_READ_TIME,
+    CONF_TOTAL_READ,
+    CONF_LOWER_BOUND,
+    CONF_UPPER_BOUND,
+    CONF_NUM,
+};
+
+#endif
diff --git a/start_readahead_tune b/start_readahead_tune
new file mode 100755
index 0000000000000000000000000000000000000000..afae00c00530b9afe5fc095dbbe31829d9365ce9
--- /dev/null
+++ b/start_readahead_tune
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Print the command-line synopsis and exit.
+# $1: exit status (optional, defaults to 0 so that -h/--help reports success).
+function usage()
+{
+    echo "Usage: $0 [ -h | --help] [ -c | --config CONFIG_FILE ]"
+    exit "${1:-0}"
+}
+
+options=$(getopt -q -o hc: --long help,config: -- "$@")
+if [ $? -ne 0 ]; then
+    echo "Invalid option!"
+    # A rejected command line is an error, so report a non-zero status.
+    usage 1
+fi
+
+eval set -- "$options"
+conf="/etc/sysconfig/readahead_tune.conf"
+
+while true;
+do
+    case "$1" in
+    -h | --help)
+        usage ;;
+    -c | --config)
+        conf="$2"
+        shift 2 ;;
+    -- )
+        shift
+        break ;;
+    * )
+        echo "Unexpected option: $1 - this should not happen."
+        usage 1 ;;
+    esac
+done
+
+# Quote the path so a name containing spaces or glob characters stays one argument.
+/usr/sbin/readahead_tune -v -c "$conf"
diff --git a/stop_readahead_tune b/stop_readahead_tune
new file mode 100755
index 0000000000000000000000000000000000000000..b044ab750eab482595d2046f24ab4c1012a274bb
--- /dev/null
+++ b/stop_readahead_tune
@@ -0,0 +1,2 @@
+#!/bin/bash
+# Send SIGTERM to every running readahead_tune process.
+pids=$(pidof readahead_tune)
+if [ -z "$pids" ]; then
+    # Without this check, kill would run with no arguments and only print its usage.
+    echo "readahead_tune is not running" >&2
+    exit 1
+fi
+# Intentionally unquoted: pidof may print several space-separated PIDs.
+kill $pids
diff --git a/tools/bpftool b/tools/bpftool
new file mode 100755
index 0000000000000000000000000000000000000000..c5629886d07cde839e88d7fee8f6d4ae839944dd
Binary files /dev/null and b/tools/bpftool differ
diff --git a/tools/pahole b/tools/pahole
new file mode 100755
index 0000000000000000000000000000000000000000..de2cf5003afd968838f24953e140b328b8f8c3b1
Binary files /dev/null and b/tools/pahole differ