diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8de927ec1911a24b03b768ff750a15fff669021b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+OUTPUT := .output
+CLANG ?= clang -v
+BPFTOOL ?= $(abspath tools/bpftool)
+PAHOLE ?= $(abspath tools/pahole)
+READELF ?= readelf
+VMLINUX ?= /usr/lib/debug/lib/modules/`uname -r`/vmlinux
+VMLINUX_HEADER ?= $(OUTPUT)/vmlinux.h
+
+# Non-empty when readelf lists a .BTF section in $(VMLINUX)
+BTF_PAHOLE_PROBE := $(shell $(READELF) -S $(VMLINUX) | grep .BTF 2>&1)
+INCLUDES := -I$(OUTPUT)
+CFLAGS := -g -Wall
+# The ARM build also needs to be verified
+ARCH := $(shell uname -m | sed 's/x86_64/x86/')
+
+# No .BTF section found: run pahole -J on $(VMLINUX) before dumping vmlinux.h
+ifeq ($(BTF_PAHOLE_PROBE),)
+  DWARF2BTF = y
+endif
+
+APPS = readahead_tune
+
+# Turn clang's system include search list into -idirafter flags for the BPF compile
+CLANG_BPF_SYS_INCLUDES = $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+# V=1 echoes the full commands; otherwise recipes are silenced and msg prints a short summary
+ifeq ($(V),1)
+  Q =
+  msg =
+else
+  Q = @
+  msg = @printf ' %-8s %s%s\n' \
+    "$(1)" \
+    "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
+    "$(if $(3), $(3))";
+  MAKEFLAGS += --no-print-directory
+endif
+
+.PHONY: all debug
+all: $(APPS)
+
+# Same as "all", with -DBPFDEBUG passed to the BPF compile
+debug: DEBUG_FLAGS = -DBPFDEBUG
+debug: all
+
+.PHONY: clean
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS)
+
+$(OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+# The output redirect below needs $(OUTPUT) to exist, also under make -j
+$(VMLINUX_HEADER): | $(OUTPUT)
+	$(call msg,GEN-VMLINUX_H,$@)
+ifeq ($(DWARF2BTF),y)
+	$(Q)$(PAHOLE) -J $(VMLINUX)
+endif
+	$(Q)$(BPFTOOL) btf dump file $(VMLINUX) format c > $@
+
+# Build BPF code
+$(OUTPUT)/%.bpf.o: %.bpf.c $(wildcard %.h) | $(OUTPUT) $(VMLINUX_HEADER)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -D__KERNEL__ -D__ASM_SYSREG_H -D__TARGET_ARCH_$(ARCH) \
+		$(DEBUG_FLAGS) \
+		$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
+		-g -O2 -target bpf -c $(filter %.c,$^) -o $@
+
+# Generate BPF skeletons
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build user-space code
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+
+$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+# Build application binary
+# When linking dynamically, try dropping -lelf and -lz
+$(APPS): %: $(OUTPUT)/%.o | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ -lbpf -lelf -lz -o $@
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/atune_bpf_collection.spec b/atune_bpf_collection.spec
new file mode 100644
index 0000000000000000000000000000000000000000..ac5bf81f978962e60e8299799a26d9c7a7af6fd9
--- /dev/null
+++ b/atune_bpf_collection.spec
@@ -0,0 +1,41 @@
+Name: A-Tune-BPF-Collection
+Version: 0.1
+Release: 1
+License: Mulan PSL v2
+Summary: BPF program collection to adjust fine-grained kernel mode to get better performance
+URL: https://gitee.com/openeuler/A-Tune-BPF-Collection
+Source0: https://gitee.com/openeuler/A-Tune-BPF-Collection/repository/archive/v%{version}.tar.gz
+
+BuildRequires: clang, llvm, libbpf-devel
+Requires: libbpf, systemd
+Provides: readahead_tune
+
+%define debug_package %{nil}
+
+%description
+A-Tune BPF Collection contains a set of BPF programs which can interact with the kernel in real time.
+It has the following capabilities:
+readahead_tune: trace file reading characteristics, then adjust file read mode to get maximum I/O efficiency
+
+%prep
+%autosetup -n %{name}-%{version} -p1
+
+%build
+make %{?_smp_mflags}
+
+%install
+install -D -p -m 0644 readahead_tune.conf %{buildroot}%{_sysconfdir}/sysconfig/readahead_tune.conf
+install -D -p -m 0644 readahead_tune.service %{buildroot}/%{_unitdir}/readahead_tune.service
+install -D -p -m 0755 readahead_tune %{buildroot}/%{_sbindir}/readahead_tune
+
+%files
+%{_sbindir}/readahead_tune
+%{_unitdir}/*.service
+%config(noreplace) %{_sysconfdir}/sysconfig/readahead_tune.conf
+
+%changelog
+* Tue Nov 9 2021 lvying - 0.1-1
+- Type:feature
+- ID:NA
+- SUG:NA
+- DESC: Init A-Tune-BPF-Collection repo and add readahead_tune service
diff --git a/common_helper.h b/common_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..304f3a1a962f70f3ca614981b62b121337853efa
--- /dev/null
+++ b/common_helper.h
@@ -0,0 +1,32 @@
+#ifndef _COMMON_HELPER_H
+#define _COMMON_HELPER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <syslog.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/* Forward a printf-style message to syslog() at the given priority. */
+#define log(level, fmt, args...) do { \
+	syslog(level, fmt, ##args); \
+} while (0)
+
+/*
+ * Raise the RLIMIT_MEMLOCK soft and hard limits to RLIM_INFINITY.
+ * Logs an error and exits with status 1 if setrlimit() fails.
+ */
+static inline void bump_memlock_rlimit(void)
+{
+	struct rlimit rlim_new = {
+		.rlim_cur = RLIM_INFINITY,
+		.rlim_max = RLIM_INFINITY,
+	};
+
+	if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) {
+		/* syslog rather than stderr: the caller may already be daemonized */
+		log(LOG_ERR, "Failed to increase RLIMIT_MEMLOCK limit!\n");
+		exit(1);
+	}
+}
+#endif
diff --git a/readahead_tune.bpf.c b/readahead_tune.bpf.c
new file mode 100644
index 0000000000000000000000000000000000000000..ad07e73f3d30e7b0e49549912c69aaa72fc1882c
--- /dev/null
+++ b/readahead_tune.bpf.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0 OR MulanPSL-2.0
+/* Copyright (C) 2021. Huawei Technologies Co., Ltd */
+#define BPF_NO_PRESERVE_ACCESS_INDEX
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include "readahead_tune.h"
+
+/*
+ * Need to keep consistent with definitions in include/linux/fs.h
+ * vmlinux.h does not contain macro
+ */
+#define FMODE_RANDOM 0x1000
+#define FMODE_WILLNEED 0x400000
+
+#define PREFIX_PATTERN "blk_"
+#define MAX_HASH_TABLE_ENTRY 10000
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+struct fs_file_read_ctx {
+	const unsigned char *name;
+	unsigned int f_mode;
+	unsigned int rsvd;
+	/* clear from f_mode */
+	unsigned int clr_f_mode;
+	/* set into f_mode */
+	unsigned int set_f_mode;
+	unsigned long key;
+	/* file size */
+	long long i_size;
+	/* previous page index */
+	long long prev_index;
+	/* current page index */
+	long long index;
+};
+
+struct fs_file_read_args {
+	struct fs_file_read_ctx *ctx;
+	int version;
+};
+
+struct fs_file_release_args {
+	void *inode;
+	void *filp;
+};
+
+/* Per-key read history kept in htab */
+struct file_rd_hnode {
+	__u64 last_nsec;
+	__u64 seq_nr;
+	__u64 tot_nr;
+};
+
+struct bpf_map_def SEC("maps") htab = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(struct file_rd_hnode),
+	.max_entries = MAX_HASH_TABLE_ENTRY,
+};
+
+/* Tunables indexed by enum conf_type; user space rewrites them before the program is loaded */
+const volatile unsigned long long file_read_conf[CONF_NUM] = {
+	DEFAULT_FILESZ,
+	DEFAULT_READ_TIME,
+	DEFAULT_TOTAL_READ,
+	DEFAULT_LOWER_BOUND,
+	DEFAULT_UPPER_BOUND
+};
+
+/* Return true when the string read from @name starts with PREFIX_PATTERN. */
+static __always_inline bool is_expected_file(void *name)
+{
+	char prefix[5];
+	int err;
+
+	err = bpf_probe_read_str(&prefix, sizeof(prefix), name);
+	if (err <= 0)
+		return false;
+	return !__builtin_memcmp(prefix, PREFIX_PATTERN, sizeof(PREFIX_PATTERN) - 1);
+}
+
+/*
+ * Handle an fs_file_read event for files whose name starts with PREFIX_PATTERN.
+ * Files no larger than CONF_FILESZ get FMODE_WILLNEED requested via set_f_mode.
+ * For larger files, sequential and total reads are counted per key; once enough
+ * reads or time have accumulated, FMODE_RANDOM is requested to be set or cleared
+ * from the sequential ratio (when CONF_TOTAL_READ is reached) and counters reset.
+ */
+SEC("raw_tracepoint.w/fs_file_read")
+int fs_file_read(struct fs_file_read_args *args)
+{
+	const char fmt[] = "elapsed %llu, seq %llu, tot %llu\n";
+	struct fs_file_read_ctx *rd_ctx = args->ctx;
+
+	if (!is_expected_file((void *)rd_ctx->name))
+		return 0;
+
+	if (rd_ctx->i_size <= file_read_conf[CONF_FILESZ]) {
+		rd_ctx->set_f_mode = FMODE_WILLNEED;
+		return 0;
+	}
+
+	__u64 now = bpf_ktime_get_ns();
+	__u64 key = rd_ctx->key;
+	bool first = false;
+	struct file_rd_hnode *hist = bpf_map_lookup_elem(&htab, &key);
+	struct file_rd_hnode new_hist;
+	if (!hist) {
+		__builtin_memset(&new_hist, 0, sizeof(new_hist));
+		new_hist.last_nsec = now;
+		first = true;
+		hist = &new_hist;
+	}
+
+	/* the consecutive read pos of the same file spatially local */
+	if (rd_ctx->index >= rd_ctx->prev_index &&
+	    rd_ctx->index - rd_ctx->prev_index <= 1)
+		hist->seq_nr += 1;
+	hist->tot_nr += 1;
+
+	/*
+	 * seq_nr and tot_nr are __u64, hence %llu.
+	 * NOTE(review): BPFDEBUG from "make debug" is not checked here - confirm intent.
+	 */
+	bpf_trace_printk(fmt, sizeof(fmt), now - hist->last_nsec,
+			 hist->seq_nr, hist->tot_nr);
+
+	if (first) {
+		bpf_map_update_elem(&htab, &key, hist, 0);
+		return 0;
+	}
+
+	if (now - hist->last_nsec >= file_read_conf[CONF_READ_TIME] || hist->tot_nr >= file_read_conf[CONF_TOTAL_READ]) {
+		if (hist->tot_nr >= file_read_conf[CONF_TOTAL_READ]) {
+			if (hist->seq_nr <= hist->tot_nr * file_read_conf[CONF_LOWER_BOUND] / HUNDRED_PERCENTAGE)
+				rd_ctx->set_f_mode = FMODE_RANDOM;
+			else if (hist->seq_nr >= hist->tot_nr * file_read_conf[CONF_UPPER_BOUND] / HUNDRED_PERCENTAGE)
+				rd_ctx->clr_f_mode = FMODE_RANDOM;
+		}
+
+		hist->last_nsec = now;
+		hist->tot_nr = 0;
+		hist->seq_nr = 0;
+	}
+
+	return 0;
+}
+
+/* On fs_file_release, delete the htab entry keyed by the released filp, if one exists. */
+SEC("raw_tracepoint/fs_file_release")
+int fs_file_release(struct fs_file_release_args *args)
+{
+	__u64 key = (unsigned long)args->filp;
+	void *value;
+
+	value = bpf_map_lookup_elem(&htab, &key);
+	if (value)
+		bpf_map_delete_elem(&htab, &key);
+
+	return 0;
+}
diff --git a/readahead_tune.c b/readahead_tune.c
new file mode 100644
index 0000000000000000000000000000000000000000..1b081b53d0b24a7b45d7db3e345b752fc7dd4fc9
--- /dev/null
+++ b/readahead_tune.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021. Huawei Technologies Co., Ltd */
+#include <argp.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "readahead_tune.h"
+#include "common_helper.h"
+#include "readahead_tune.skel.h"
+
+/* Name of a configuration environment variable and its fallback value */
+struct env_conf {
+	const char *name;
+	unsigned long long default_val;
+};
+
+const struct env_conf confs[CONF_NUM] = {
+	{"filesz-threshold", DEFAULT_FILESZ},
+	{"read-time-threshold", DEFAULT_READ_TIME},
+	{"total-read-threshold", DEFAULT_TOTAL_READ},
+	{"lower-bound-percentage", DEFAULT_LOWER_BOUND},
+	{"upper-bound-percentage", DEFAULT_UPPER_BOUND}
+};
+
+/*
+ * Fill skel->rodata->file_read_conf from the environment variables named in confs[].
+ * Unset, empty, non-numeric, negative or out-of-range values fall back to the defaults,
+ * as do inconsistent percentage bounds and an overflowing total-read-threshold.
+ */
+static void parse_config_env(struct readahead_tune_bpf *skel)
+{
+	for (int i = 0; i < CONF_NUM; i++) {
+		char *env_str = getenv(confs[i].name);
+		if (env_str && strlen(env_str)) {
+			char *endptr;
+			errno = 0;
+			long long val = strtoll(env_str, &endptr, 10);
+
+			if ((errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN))
+				|| (errno != 0 && val == 0)
+				|| endptr == env_str
+				|| *endptr != '\0'
+				|| val < 0) {
+				log(LOG_ERR, "Invalid env %s, use default!\n", env_str);
+				skel->rodata->file_read_conf[i] = confs[i].default_val;
+			} else {
+				skel->rodata->file_read_conf[i] = (unsigned long long)val;
+			}
+		} else {
+			skel->rodata->file_read_conf[i] = confs[i].default_val;
+		}
+	}
+
+	if (skel->rodata->file_read_conf[CONF_LOWER_BOUND] >= skel->rodata->file_read_conf[CONF_UPPER_BOUND]
+		|| skel->rodata->file_read_conf[CONF_LOWER_BOUND] <= ZERO_PERCENTAGE
+		|| skel->rodata->file_read_conf[CONF_UPPER_BOUND] >= HUNDRED_PERCENTAGE) {
+		log(LOG_ERR, "lower-bound-percentage(%llu), upper-bound-percentage(%llu) is invalid, use default\n",
+			skel->rodata->file_read_conf[CONF_LOWER_BOUND], skel->rodata->file_read_conf[CONF_UPPER_BOUND]);
+		skel->rodata->file_read_conf[CONF_LOWER_BOUND] = DEFAULT_LOWER_BOUND;
+		skel->rodata->file_read_conf[CONF_UPPER_BOUND] = DEFAULT_UPPER_BOUND;
+	}
+
+	/*
+	 * The BPF side multiplies its read counter by each bound once the counter
+	 * reaches total-read-threshold, so reject thresholds whose product would
+	 * overflow. Both bounds are non-zero after the range check above.
+	 */
+	if (skel->rodata->file_read_conf[CONF_TOTAL_READ] > ULLONG_MAX / skel->rodata->file_read_conf[CONF_LOWER_BOUND]
+		|| skel->rodata->file_read_conf[CONF_TOTAL_READ] > ULLONG_MAX / skel->rodata->file_read_conf[CONF_UPPER_BOUND]) {
+		log(LOG_ERR, "total-read-threshold(%llu) is too large, use default\n", skel->rodata->file_read_conf[CONF_TOTAL_READ]);
+		skel->rodata->file_read_conf[CONF_TOTAL_READ] = DEFAULT_TOTAL_READ;
+	}
+
+	log(LOG_INFO, "All the file_read_conf finally set as following:\n");
+	for (int i = 0; i < CONF_NUM; i++)
+		log(LOG_INFO, "Config %s = %llu\n", confs[i].name, skel->rodata->file_read_conf[i]);
+}
+
+bool verbose;
+static const struct argp_option opts[] = {
+	{ "verbose", 'v', NULL, 0, "Verbose debug output" },
+	{},
+};
+
+/* argp callback: -v enables verbose output; positional arguments print usage. */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case 'v':
+		verbose = true;
+		break;
+	case ARGP_KEY_ARG:
+		argp_usage(state);
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static const struct argp argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+/*
+ * libbpf print callback. Debug-level messages are dropped unless -v was given.
+ * Output goes to syslog because daemon(0, 0) has redirected stderr to /dev/null.
+ */
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	vsyslog(level == LIBBPF_WARN ? LOG_WARNING : LOG_INFO, format, args);
+	return 0;
+}
+
+/*
+ * Parse arguments, daemonize, then open, configure, load and attach the BPF
+ * skeleton and block in pause() until a signal arrives.
+ */
+int main(int argc, char *argv[])
+{
+	struct readahead_tune_bpf *skel;
+
+	int err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+	if (err)
+		return err;
+
+	err = daemon(0, 0);
+	if (err) {
+		perror("Failed to daemon");
+		return err;
+	}
+
+	/* Set up libbpf errors and debug info callback */
+	libbpf_set_print(libbpf_print_fn);
+
+	/* Bump RLIMIT_MEMLOCK to create BPF maps */
+	bump_memlock_rlimit();
+
+	skel = readahead_tune_bpf__open();
+	if (!skel) {
+		log(LOG_ERR, "Failed to open BPF skeleton\n");
+		return -1;
+	}
+
+	parse_config_env(skel);
+
+	err = readahead_tune_bpf__load(skel);
+	if (err) {
+		log(LOG_ERR, "Failed to load and verify BPF skeleton\n");
+		goto cleanup;
+	}
+
+	err = readahead_tune_bpf__attach(skel);
+	if (err) {
+		log(LOG_ERR, "Failed to attach BPF skeleton\n");
+		goto cleanup;
+	}
+
+	pause();
+
+cleanup:
+	readahead_tune_bpf__destroy(skel);
+	return err;
+}
diff --git a/readahead_tune.conf b/readahead_tune.conf
new file mode 100644
index 0000000000000000000000000000000000000000..953e17ef9a8955aed844b6755b1f70cb15106cfd
--- /dev/null
+++ b/readahead_tune.conf
@@ -0,0 +1,29 @@
+# readahead tune
+# Note: Run-time configuration is unsupported, service restart needed.
+# Note: this file should be installed at /etc/sysconfig/readahead_tune.conf
+
+# Specify the threshold of file size.
+# The BPF program only traces files whose size exceeds the threshold
+# Default unit is byte, other unit conf is not supported, default is 4MB
+filesz-threshold=4194304
+
+# Specify the sampling times of file read
+# BPF program judges the file reading characteristics based on the sampling times of file read
+# and then adjusts the file reading mode
+total-read-threshold=10
+
+# Specify the time interval threshold between two consecutive readings of the same file
+# If consecutive file read exceeds the time interval threshold, BPF program resamples
+# the file read to determine the file reading characteristics
+# Default unit is ns, other unit conf is not supported, default is 500ms
+read-time-threshold=500000000
+
+# Specify the lower bound percentage of sequential read ratio
+# If sequential read ratio is lower than the lower bound, BPF program will set file read mode FMODE_RANDOM
+# the range is (0, 100)
+lower-bound-percentage=30
+
+# Specify the upper bound percentage of sequential read ratio
+# If sequential read ratio is higher than the upper bound, BPF program will clear file read mode FMODE_RANDOM
+# the range is (0, 100), upper-bound-percentage > lower-bound-percentage is required
+upper-bound-percentage=70
diff --git a/readahead_tune.h b/readahead_tune.h
new file mode 100644
index 0000000000000000000000000000000000000000..340af41b62089004a9193afe85e22824228a90fe
--- /dev/null
+++ b/readahead_tune.h
@@ -0,0 +1,22 @@
+#ifndef _READAHEAD_TUNE_H
+#define _READAHEAD_TUNE_H
+
+#define ZERO_PERCENTAGE 0
+#define HUNDRED_PERCENTAGE 100
+#define DEFAULT_FILESZ (4 << 20)
+#define DEFAULT_READ_TIME 500000000ULL
+#define DEFAULT_TOTAL_READ 10
+#define DEFAULT_LOWER_BOUND 30
+#define DEFAULT_UPPER_BOUND 70
+
+/* Index of each tunable in file_read_conf[] and confs[]; CONF_NUM is the number of tunables */
+enum conf_type {
+	CONF_FILESZ,
+	CONF_READ_TIME,
+	CONF_TOTAL_READ,
+	CONF_LOWER_BOUND,
+	CONF_UPPER_BOUND,
+	CONF_NUM,
+};
+
+#endif
diff --git a/readahead_tune.service b/readahead_tune.service
new file mode 100644
index 0000000000000000000000000000000000000000..809eaa415446fe262d862c61eed6eecb291ef7c7
--- /dev/null
+++ b/readahead_tune.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=readahead daemon to trace file read and adjust file read mode
+After=syslog.target
+
+[Service]
+Type=forking
+EnvironmentFile=/etc/sysconfig/readahead_tune.conf
+ExecStart=/usr/sbin/readahead_tune -v
+Restart=on-abort
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tools/bpftool b/tools/bpftool
new file mode 100755
index 0000000000000000000000000000000000000000..c5629886d07cde839e88d7fee8f6d4ae839944dd
Binary files /dev/null and b/tools/bpftool differ
diff --git a/tools/pahole b/tools/pahole
new file mode 100755
index 0000000000000000000000000000000000000000..de2cf5003afd968838f24953e140b328b8f8c3b1
Binary files /dev/null and b/tools/pahole differ