diff --git a/source/tools/monitor/oomkill/Makefile b/source/tools/monitor/oomkill/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..db0b615770a886a5ceefed8ab05005b544c9bd9b --- /dev/null +++ b/source/tools/monitor/oomkill/Makefile @@ -0,0 +1,6 @@ +target := oomkill + +mods := main.o globals.o kill.o meminfo.o msg.o metric.o + +LDFLAGS += -lm +include $(SRC)/mk/csrc.mk diff --git a/source/tools/monitor/oomkill/README.md b/source/tools/monitor/oomkill/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c638c32e4aedd4a99539adb64035e72017488aea --- /dev/null +++ b/source/tools/monitor/oomkill/README.md @@ -0,0 +1,48 @@ +# 功能说明 +用户态oomkill服务 + +# 使用说明 +sysak oomkill +-m PERCENT[,KILL_PERCENT] set available memory minimum to PERCENT of total + (default 7 %). + oomkill sends SIGTERM once below PERCENT, then + SIGKILL once below KILL_PERCENT (default PERCENT/2; 5 % when -m is not given). + + -M SIZE[,KILL_SIZE] set available memory minimum to SIZE KiB + + -k MODE kill mode: 0, 1, 2, 3 (default 1) + + -i PERCENT cpu iowait threshold in percent (default 30) + + -I PERCENT cpu system threshold in percent (default 40) + + -n enable d-bus notifications + + -N /PATH/TO/SCRIPT call script after oom kill + + -g kill all processes within a process group + + -d enable debugging messages + + -v print version information and exit + + -r INTERVAL memory report interval in seconds (default 10); + values below 1 are rejected + + -p set niceness of oomkill to -20 and oom_score_adj to + -100 (already enabled by default) + + --ignore-root-user do not kill processes owned by root + + --prefer REGEX prefer to kill processes matching REGEX + + --avoid REGEX avoid killing processes matching REGEX + + --ignore REGEX ignore processes matching REGEX + + --dryrun dry run (do not kill any processes) + + -h, --help this help text + + + diff --git a/source/tools/monitor/oomkill/globals.c b/source/tools/monitor/oomkill/globals.c new file mode 100644 index 0000000000000000000000000000000000000000..99424d97484cc84a4cd86d1d1dfaef3521c29f88 --- 
/dev/null +++ b/source/tools/monitor/oomkill/globals.c @@ -0,0 +1,5 @@ +int enable_debug = 0; + +// This variable exists so the tests can point +// it to a mockup proc dir +char* procdir_path = "/proc"; diff --git a/source/tools/monitor/oomkill/globals.h b/source/tools/monitor/oomkill/globals.h new file mode 100644 index 0000000000000000000000000000000000000000..807c860406ef1f812e6747028c8035d74d088f07 --- /dev/null +++ b/source/tools/monitor/oomkill/globals.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef GLOBALS_H +#define GLOBALS_H + +extern int enable_debug; + +extern char* procdir_path; + +#endif diff --git a/source/tools/monitor/oomkill/kill.c b/source/tools/monitor/oomkill/kill.c new file mode 100644 index 0000000000000000000000000000000000000000..d094dbc92f98216106abcc6b10aa5b036701aa9d --- /dev/null +++ b/source/tools/monitor/oomkill/kill.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: MIT + +/* Kill the most memory-hungy process */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* Definition of SYS_* constants */ +#include +#include + +#include "globals.h" +#include "kill.h" +#include "meminfo.h" +#include "msg.h" + +// Processes matching "--prefer REGEX" get BADNESS_PREFER added to their badness +#define BADNESS_PREFER 300 +// Processes matching "--avoid REGEX" get BADNESS_AVOID added to their badness +#define BADNESS_AVOID -300 + +// Buffer size for UID/GID/PID string conversion +#define UID_BUFSIZ 128 +// At most 1 notification per second when --dryrun is active +#define NOTIFY_RATELIMIT 1 + +static bool isnumeric(char* str) +{ + int i = 0; + + // Empty string is not numeric + if (str[0] == 0) + return false; + + while (1) { + if (str[i] == 0) // End of string + return true; + + if (isdigit(str[i]) == 0) + return false; + + i++; + } +} + +static void notify_dbus(const char* summary, const char* body) +{ + int pid = fork(); + if (pid > 0) { + // parent + return; + } + char summary2[1024] = { 0 
}; + snprintf(summary2, sizeof(summary2), "string:%s", summary); + char body2[1024] = "string:"; + if (body != NULL) { + snprintf(body2, sizeof(body2), "string:%s", body); + } + // Complete command line looks like this: + // dbus-send --system / net.nuetzlich.SystemNotifications.Notify 'string:summary text' 'string:and body text' + execl("/usr/bin/dbus-send", "dbus-send", "--system", "/", "net.nuetzlich.SystemNotifications.Notify", + summary2, body2, NULL); + warn("%s: exec failed: %s\n", __func__, strerror(errno)); + exit(1); +} + +static void notify_ext(const char* script, const procinfo_t* victim) +{ + pid_t pid1 = fork(); + + if (pid1 == -1) { + warn("notify_ext: fork() returned -1: %s\n", strerror(errno)); + return; + } else if (pid1 != 0) { + return; + } + + char pid_str[UID_BUFSIZ] = { 0 }; + char uid_str[UID_BUFSIZ] = { 0 }; + + snprintf(pid_str, UID_BUFSIZ, "%d", victim->pid); + snprintf(uid_str, UID_BUFSIZ, "%d", victim->uid); + + setenv("EARLYOOM_PID", pid_str, 1); + setenv("EARLYOOM_UID", uid_str, 1); + setenv("EARLYOOM_NAME", victim->name, 1); + + execl(script, script, NULL); + warn("%s: exec %s failed: %s\n", __func__, script, strerror(errno)); + exit(1); +} + +static void notify_process_killed(const poll_loop_args_t* args, const procinfo_t* victim) +{ + // Dry run can cause the notify function to be called on each poll as + // nothing is immediately done to change the situation we don't know how + // heavy the notify script is so avoid spamming it + if (args->dryrun) { + static struct timespec prev_notify = { 0 }; + struct timespec cur_time = { 0 }; + + int ret = clock_gettime(CLOCK_MONOTONIC, &cur_time); + if (ret == -1) { + warn("%s: clock_gettime failed: %s\n", __func__, strerror(errno)); + return; + } + // Ignores nanoseconds, but good enough here + if (cur_time.tv_sec - prev_notify.tv_sec < NOTIFY_RATELIMIT) { + // Too soon + debug("%s: rate limit hit, skipping notifications this time\n", __func__); + return; + } + prev_notify = cur_time; + } + 
+ if (args->notify) { + char notif_args[PATH_MAX + 1000]; + snprintf(notif_args, sizeof(notif_args), + "Low memory! Killing process %d %s", victim->pid, victim->name); + notify_dbus("oomkill", notif_args); + } + if (args->notify_ext) { + notify_ext(args->notify_ext, victim); + } +} + +#if defined(__NR_pidfd_open) && defined(__NR_process_mrelease) +void mrelease(const pid_t pid) +{ + int pidfd = (int)syscall(__NR_pidfd_open, pid, 0); + if (pidfd < 0) { + // can happen if process has already exited + debug("mrelease: pid %d: error opening pidfd: %s\n", pid, strerror(errno)); + return; + } + int res = (int)syscall(__NR_process_mrelease, pidfd, 0); + if (res != 0) { + warn("mrelease: pid=%d pidfd=%d failed: %s\n", pid, pidfd, strerror(errno)); + } else { + debug("mrelease: pid=%d pidfd=%d success\n", pid, pidfd); + } +} +#else +void mrelease(__attribute__((unused)) const pid_t pid) +{ + debug("mrelease: process_mrelease() and/or pidfd_open() not available\n"); +} +#ifndef __NR_pidfd_open +#warning "__NR_pidfd_open is undefined, cannot use process_mrelease" +#endif +#ifndef __NR_process_mrelease +#warning "__NR_process_mrelease is undefined, cannot use process_mrelease" +#endif +#endif + +/* + * Send the selected signal to "pid" and wait for the process to exit + * (max 10 seconds) + */ +int kill_wait(const poll_loop_args_t* args, pid_t pid, int sig) +{ + int pidfd = -1; + + if (args->dryrun && sig != 0) { + warn("dryrun, not actually sending any signal\n"); + return 0; + } + const unsigned poll_ms = 100; + if (args->kill_process_group) { + int res = getpgid(pid); + if (res < 0) { + return res; + } + pid = -res; + warn("killing whole process group %d (-g flag is active)\n", res); + } + +#if defined(__NR_pidfd_open) && defined(__NR_process_mrelease) + // Open the pidfd *before* calling kill(). + // Otherwise process_mrelease() fails in 50% of cases with ESRCH. 
+ if (!args->kill_process_group && sig != 0) { + pidfd = (int)syscall(__NR_pidfd_open, pid, 0); + if (pidfd < 0) { + warn("%s pid %d: error opening pidfd: %s\n", __func__, pid, strerror(errno)); + } + } +#endif + + int res = kill(pid, sig); + if (res != 0) { + if (pidfd >= 0) { + close(pidfd); + } + return res; + } + /* signal 0 does not kill the process. Don't wait for it to exit */ + if (sig == 0) { + return 0; + } + + struct timespec t0 = { 0 }; + clock_gettime(CLOCK_MONOTONIC, &t0); + +#if defined(__NR_pidfd_open) && defined(__NR_process_mrelease) + // Call the process_mrelease() syscall to release all the memory of + // the killed process as quickly as possible - see https://lwn.net/Articles/864184/ + // for details. + if (pidfd >= 0) { + int res = (int)syscall(__NR_process_mrelease, pidfd, 0); + if (res != 0) { + warn("%s pid=%d: process_mrelease pidfd=%d failed: %s\n", __func__, pid, pidfd, strerror(errno)); + } else { + debug("%s pid=%d: process_mrelease pidfd=%d success\n", __func__, pid, pidfd); + } + close(pidfd); + } +#endif + + for (unsigned i = 0; i < 100; i++) { + struct timespec t1 = { 0 }; + clock_gettime(CLOCK_MONOTONIC, &t1); + float secs = (float)(t1.tv_sec - t0.tv_sec) + (float)(t1.tv_nsec - t0.tv_nsec) / (float)1e9; + + // We have sent SIGTERM but now have dropped below SIGKILL limits. + // Escalate to SIGKILL. 
+ if (sig != SIGKILL) { + meminfo_t m = parse_meminfo(); + print_mem_stats(debug, m); + if (m.MemAvailablePercent <= args->mem_kill_percent && m.SwapFreePercent <= args->swap_kill_percent) { + sig = SIGKILL; + res = kill(pid, sig); + // kill first, print after + warn("escalating to SIGKILL after %.1f seconds\n", secs); + if (res != 0) { + return res; + } + } + } else if (enable_debug) { + meminfo_t m = parse_meminfo(); + print_mem_stats(printf, m); + } + if (!is_alive(pid)) { + warn("process %d exited after %.3f seconds\n", pid, secs); + return 0; + } + struct timespec req = { .tv_sec = (time_t)(poll_ms / 1000), .tv_nsec = (poll_ms % 1000) * 1000000 }; + nanosleep(&req, NULL); + } + errno = ETIME; + return -1; +} + +static get_oom_points(const poll_loop_args_t* args, long *points, procinfo_t* cur) +{ + long rss = 0; + int adj = 0;; + + get_oom_score_adj(cur->pid, &adj); + rss = get_vm_rss_kib(cur->pid); + + if (adj == -1000 || rss < 1024) + return false; + + cur->oom_score_adj = adj; + cur->VmRSSkiB = rss; + + if ((args->prefer_regex || args->avoid_regex || args->ignore_regex)) { + int res = get_comm(cur->pid, cur->name, sizeof(cur->name)); + if (res < 0) { + debug("pid %d: error reading process name: %s\n", cur->pid, strerror(-res)); + return false; + } + if (args->prefer_regex && regexec(args->prefer_regex, cur->name, (size_t)0, NULL, 0) == 0) { + adj += BADNESS_PREFER; + } + if (args->avoid_regex && regexec(args->avoid_regex, cur->name, (size_t)0, NULL, 0) == 0) { + adj += BADNESS_AVOID; + } + if (args->ignore_regex && regexec(args->ignore_regex, cur->name, (size_t)0, NULL, 0) == 0) { + return false; + } + } + + rss += adj * (args->m.MemTotalKiB/1000); + *points = rss; + return true; +} +// is_larger finds out if the process with pid `cur->pid` uses more memory +// than our current `victim`. +// In the process, it fills the `cur` structure. It does so lazily, meaning +// it only fills the fields it needs to make a decision. 
+bool is_larger(const poll_loop_args_t* args, const procinfo_t* victim, procinfo_t* cur) +{ + long points = 0; + + if (cur->pid <= 1) { + // Let's not kill init. + return false; + } + + { + int res = get_uid(cur->pid); + if (res < 0) { + debug("pid %d: error reading uid: %s\n", cur->pid, strerror(-res)); + return false; + } + cur->uid = res; + } + if (cur->uid == 0 && args->ignore_root_user) { + // Ignore processes owned by root user. + return false; + } + + { + int res = get_oom_points(args, &points, cur); + if (res < 0) { + debug("pid %d: error reading oom_score: %s\n", cur->pid, strerror(-res)); + return false; + } + cur->badness = points; + } + if (cur->badness < victim->badness) { + return false; + } + if (cur->badness == victim->badness && cur->VmRSSkiB <= victim->VmRSSkiB) { + return false; + } + + // Looks like we have a new victim. Fill out remaining fields + if (strlen(cur->name) == 0) { + int res = get_comm(cur->pid, cur->name, sizeof(cur->name)); + if (res < 0) { + debug("pid %d: error reading process name: %s\n", cur->pid, strerror(-res)); + return false; + } + } + return true; +} + +// debug_print_procinfo pretty-prints the process information in `cur`. +void debug_print_procinfo(const procinfo_t* cur) +{ + if (!enable_debug) { + return; + } + debug("pid %5d: badness %3ld VmRSS %7lld uid %4d oom_score_adj %4d \"%s\"", + cur->pid, cur->badness, cur->VmRSSkiB, cur->uid, cur->oom_score_adj, cur->name); +} + +/* + * Find the process with the largest oom_score. 
+ */ +procinfo_t find_largest_process(const poll_loop_args_t* args) +{ + DIR* procdir = opendir(procdir_path); + if (procdir == NULL) { + fatal(5, "%s: could not open /proc: %s", __func__, strerror(errno)); + } + + struct timespec t0 = { 0 }, t1 = { 0 }; + if (enable_debug) { + clock_gettime(CLOCK_MONOTONIC, &t0); + } + + procinfo_t victim = { 0 }; + while (1) { + errno = 0; + struct dirent* d = readdir(procdir); + if (d == NULL) { + if (errno != 0) + warn("%s: readdir error: %s", __func__, strerror(errno)); + break; + } + + // proc contains lots of directories not related to processes, + // skip them + if (!isnumeric(d->d_name)) + continue; + + procinfo_t cur = { + .pid = (int)strtol(d->d_name, NULL, 10), + .uid = -1, + .badness = -1, + .VmRSSkiB = -1, + .oom_score_adj = -1, + /* omitted fields are set to zero */ + }; + + bool larger = is_larger(args, &victim, &cur); + + debug_print_procinfo(&cur); + + if (larger) { + debug(" <--- new victim\n"); + victim = cur; + } else { + debug("\n"); + } + } + closedir(procdir); + + if (enable_debug) { + clock_gettime(CLOCK_MONOTONIC, &t1); + long delta = (t1.tv_sec - t0.tv_sec) * 1000000 + (t1.tv_nsec - t0.tv_nsec) / 1000; + debug("selecting victim took %ld.%03ld ms\n", delta / 1000, delta % 1000); + } + + if (victim.pid == getpid()) { + warn("%s: selected myself (pid %d). Do you use hidpid? See https://github.com/rfjakob/oomkill/wiki/proc-hidepid\n", + __func__, victim.pid); + // zero victim struct + victim = (const procinfo_t) { 0 }; + } + + return victim; +} + +/* + * Kill the victim process, wait for it to exit, send a gui notification + * (if enabled). + */ +void kill_process(const poll_loop_args_t* args, int sig, const procinfo_t* victim) +{ + if (victim->pid <= 0) { + warn("Could not find a process to kill. Sleeping 1 second.\n"); + if (args->notify) { + notify_dbus("oomkill", "Error: Could not find a process to kill. 
Sleeping 1 second."); + } + sleep(1); + return; + } + + char* sig_name = "?"; + if (sig == SIGTERM) { + sig_name = "SIGTERM"; + } else if (sig == SIGKILL) { + sig_name = "SIGKILL"; + } else if (sig == 0) { + sig_name = "0 (no-op signal)"; + } + // sig == 0 is used as a self-test during startup. Don't notify the user. + if (sig != 0 || enable_debug) { + warn("sending %s to process %d uid %d \"%s\": badness %ld, VmRSS %lld MiB adj:%d\n", + sig_name, victim->pid, victim->uid, victim->name, victim->badness, victim->VmRSSkiB / 1024, victim->oom_score_adj); + } + + int res = kill_wait(args, victim->pid, sig); + int saved_errno = errno; + + // Send the GUI notification AFTER killing a process. This makes it more likely + // that there is enough memory to spawn the notification helper. + if (sig != 0) { + notify_process_killed(args, victim); + } + + if (sig == 0) { + return; + } + + if (res != 0) { + warn("kill failed: %s\n", strerror(saved_errno)); + if (args->notify) { + notify_dbus("oomkill", "Error: Failed to kill process"); + } + // Killing the process may have failed because we are not running as root. + // In that case, trying again in 100ms will just yield the same error. + // Throttle ourselves to not spam the log. 
+ if (saved_errno == EPERM) { + warn("sleeping 1 second\n"); + sleep(1); + } + } +} diff --git a/source/tools/monitor/oomkill/kill.h b/source/tools/monitor/oomkill/kill.h new file mode 100644 index 0000000000000000000000000000000000000000..89c8c3f75391fec0120bfe04969aad1619a47aec --- /dev/null +++ b/source/tools/monitor/oomkill/kill.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef KILL_H +#define KILL_H + +#include +#include +#include +#include +#include "meminfo.h" +#include "metric.h" + +#define KILL_MODE_0 (0) +#define KILL_MODE_1 (1) +#define KILL_MODE_2 (2) +#define KILL_MODE_3 (3) + +typedef struct { + /* if the available memory AND swap goes below these percentages, + * we start killing processes */ + double mem_term_percent; + double mem_kill_percent; + double swap_term_percent; + double swap_kill_percent; + /* send d-bus notifications? */ + bool notify; + /* Path to script for programmatic notifications (or NULL) */ + char* notify_ext; + /* kill all processes within a process group */ + bool kill_process_group; + /* do not kill processes owned by root */ + bool ignore_root_user; + /* prefer/avoid killing these processes. NULL = no-op. */ + regex_t* prefer_regex; + regex_t* avoid_regex; + /* will ignore these processes. NULL = no-op. 
*/ + regex_t* ignore_regex; + /* memory report interval, in milliseconds */ + int report_interval_ms; + /* Flag --dryrun was passed */ + bool dryrun; + struct cpu_stat cstat_prev; + struct cpu_util cstat_util; + meminfo_t m; + memstatus mode; + int kill_mode; + long min; + long low; + long high; + int iowait_thres; + int sys_thres; + int poll_fd; + int eventc_fd; + int pressure_fd; + struct pollfd pfd; +} poll_loop_args_t; + +struct kill_args { + long iowait_avg; + long iowait_thres; + long sys_avg; + long sys_thres; + long kill_mode; +}; + +void kill_process(const poll_loop_args_t* args, int sig, const procinfo_t* victim); +procinfo_t find_largest_process(const poll_loop_args_t* args); + +#endif diff --git a/source/tools/monitor/oomkill/main.c b/source/tools/monitor/oomkill/main.c new file mode 100644 index 0000000000000000000000000000000000000000..98401f556b4ba43aa9f8c33c7ffa0ea83f11fb3c --- /dev/null +++ b/source/tools/monitor/oomkill/main.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: MIT + +/* Check available memory and swap in a loop and start killing + * processes if they get too low */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "globals.h" +#include "kill.h" +#include "meminfo.h" +#include "msg.h" + +/* Don't fail compilation if the user has an old glibc that + * does not define MCL_ONFAULT. The kernel may still be recent + * enough to support the flag. 
+ */ +#ifndef MCL_ONFAULT +#define MCL_ONFAULT 4 +#endif + +#ifndef VERSION +#define VERSION "*** v1.0 version ***" +#endif + +/* Arbitrary identifiers for long options that do not have a short + * version */ +enum { + LONG_OPT_PREFER = 513, + LONG_OPT_AVOID, + LONG_OPT_DRYRUN, + LONG_OPT_IGNORE, + LONG_OPT_IGNORE_ROOT, +}; + +static int set_oom_score_adj(int); +static void poll_loop(poll_loop_args_t* args); +extern int metric_init(poll_loop_args_t *poll); +extern int get_cpu_stat(poll_loop_args_t *poll); +extern int metric_exit(poll_loop_args_t *poll); +extern int event_poll(poll_loop_args_t *poll, int timeout); +// Prevent Golang / Cgo name collision when the test suite runs - +// Cgo generates it's own main function. +#ifdef CGO +#define main main2 +#endif + +double min(double x, double y) +{ + if (x < y) + return x; + return y; +} + +// Dry-run oom kill to make sure that +// (1) it works (meaning /proc is accessible) +// (2) the stack grows to maximum size before calling mlockall() +static void startup_selftests(poll_loop_args_t* args) +{ + { + debug("%s: dry-running oom kill...\n", __func__); + procinfo_t victim = find_largest_process(args); + kill_process(args, 0, &victim); + } + if (args->notify_ext) { + if (args->notify_ext[0] != '/') { + warn("%s: -N: notify script '%s' is not an absolute path, disabling -N\n", __func__, args->notify_ext); + args->notify_ext = NULL; + } else if (access(args->notify_ext, X_OK)) { + warn("%s: -N: notify script '%s' is not executable: %s\n", __func__, args->notify_ext, strerror(errno)); + } + } +} + +int main(int argc, char* argv[]) +{ + poll_loop_args_t args = { + .mem_term_percent = 7, + .swap_term_percent = 10, + .mem_kill_percent = 5, + .swap_kill_percent = 5, + .report_interval_ms = 10000, + .iowait_thres = 30, + .sys_thres = 40, + .ignore_root_user = false, + .kill_mode = KILL_MODE_1, + /* omitted fields are set to zero */ + }; + int set_my_priority = 1; + char* prefer_cmds = NULL; + char* avoid_cmds = NULL; + char* 
ignore_cmds = NULL; + regex_t _prefer_regex; + regex_t _avoid_regex; + regex_t _ignore_regex; + + /* request line buffering for stdout - otherwise the output + * may lag behind stderr */ + setlinebuf(stdout); + + /* clean up dbus-send zombies */ + signal(SIGCHLD, SIG_IGN); + + fprintf(stderr, "oomkill " VERSION "\n"); + + if (chdir(procdir_path) != 0) { + fatal(4, "Could not cd to /proc: %s", strerror(errno)); + } + + // PR_CAP_AMBIENT is not available on kernel < 4.3 +#ifdef PR_CAP_AMBIENT + // When systemd starts a daemon with capabilities, it uses ambient + // capabilities to do so. If not dropped, the capabilities can spread + // to any child process. This is usually not necessary and its a good + // idea to drop them if not needed. + prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0); +#endif + + meminfo_t m = parse_meminfo(); + + int c; + const char* short_opt = "m:k:i:I:f:s:M:S:ngN:dvr:ph"; + struct option long_opt[] = { + { "prefer", required_argument, NULL, LONG_OPT_PREFER }, + { "avoid", required_argument, NULL, LONG_OPT_AVOID }, + { "ignore", required_argument, NULL, LONG_OPT_IGNORE }, + { "dryrun", no_argument, NULL, LONG_OPT_DRYRUN }, + { "ignore-root-user", no_argument, NULL, LONG_OPT_IGNORE_ROOT }, + { "help", no_argument, NULL, 'h' }, + { 0, 0, NULL, 0 } /* end-of-array marker */ + }; + bool have_m = 0, have_M = 0, have_s = 0, have_S = 0; + double mem_term_kib = 0, mem_kill_kib = 0, swap_term_kib = 0, swap_kill_kib = 0; + + while ((c = getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1) { + float report_interval_f = 0; + term_kill_tuple_t tuple; + + switch (c) { + case -1: /* no more arguments */ + case 0: /* long option toggles */ + break; + case 'm': + // Use 99 as upper limit. Passing "-m 100" makes no sense. 
+ tuple = parse_term_kill_tuple(optarg, 99); + if (strlen(tuple.err)) { + fatal(15, "-m: %s", tuple.err); + } + args.mem_term_percent = tuple.term; + args.mem_kill_percent = tuple.kill; + have_m = 1; + break; + case 's': + // Using "-s 100" is a valid way to ignore swap usage + tuple = parse_term_kill_tuple(optarg, 100); + if (strlen(tuple.err)) { + fatal(16, "-s: %s", tuple.err); + } + args.swap_term_percent = tuple.term; + args.swap_kill_percent = tuple.kill; + have_s = 1; + break; + case 'M': + tuple = parse_term_kill_tuple(optarg, m.MemTotalKiB * 100 / 99); + if (strlen(tuple.err)) { + fatal(15, "-M: %s", tuple.err); + } + mem_term_kib = tuple.term; + mem_kill_kib = tuple.kill; + have_M = 1; + break; + case 'k': + args.kill_mode = strtol(optarg, NULL, 10); + printf("kill mode:%d\n", args.kill_mode); + break; + case 'S': + tuple = parse_term_kill_tuple(optarg, m.SwapTotalKiB * 100 / 99); + if (strlen(tuple.err)) { + fatal(16, "-S: %s", tuple.err); + } + if (m.SwapTotalKiB == 0) { + warn("warning: -S: total swap is zero, using default percentages\n"); + break; + } + /* record both limits and mark -S as seen; without have_S the + * "-S"/"-s" merge after option parsing never runs and -S is a no-op */ + swap_term_kib = tuple.term; + swap_kill_kib = tuple.kill; + have_S = 1; + break; + case 'i': + /* iowait_thres is an int: narrow explicitly and print with %d */ + args.iowait_thres = (int)strtol(optarg, NULL, 10); + printf("iowait_thres:%d\n", args.iowait_thres); + break; + case 'I': + /* sys_thres is an int: narrow explicitly and print with %d */ + args.sys_thres = (int)strtol(optarg, NULL, 10); + printf("sys_thres:%d\n", args.sys_thres); + break; + case 'n': + args.notify = true; + fprintf(stderr, "Notifying through D-Bus\n"); + break; + case 'g': + args.kill_process_group = true; + break; + case 'N': + args.notify_ext = optarg; + break; + case 'd': + enable_debug = 1; + break; + case 'v': + // The version has already been printed above + exit(0); + case 'r': + report_interval_f = strtof(optarg, NULL); + if (report_interval_f >= 1) { + args.report_interval_ms = (int)(report_interval_f * 1000); + } else { + warn("-r: invalid interval '%s', must be >= 1 second\n", optarg); + } + + break; + case 'p': + set_my_priority = 1; + break; + case LONG_OPT_IGNORE_ROOT: + args.ignore_root_user = 
true; + fprintf(stderr, "Processes owned by root will not be killed\n"); + break; + case LONG_OPT_PREFER: + prefer_cmds = optarg; + break; + case LONG_OPT_AVOID: + avoid_cmds = optarg; + break; + case LONG_OPT_DRYRUN: + warn("dryrun mode enabled, will not kill anything\n"); + args.dryrun = 1; + break; + case LONG_OPT_IGNORE: + ignore_cmds = optarg; + break; + case 'h': + /* defaults quoted below mirror the initializer of `args` in main() */ + fprintf(stderr, + "Usage: %s [OPTION]...\n" + "\n" + " -m PERCENT[,KILL_PERCENT] set available memory minimum to PERCENT of total\n" + " (default 7 %%).\n" + " oomkill sends SIGTERM once below PERCENT, then\n" + " SIGKILL once below KILL_PERCENT (default PERCENT/2).\n" + " -s PERCENT[,KILL_PERCENT] set free swap minimum to PERCENT of total (default\n" + " 10 %%).\n" + " Note: both memory and swap must be below minimum for\n" + " oomkill to act.\n" + " -M SIZE[,KILL_SIZE] set available memory minimum to SIZE KiB\n" + " -S SIZE[,KILL_SIZE] set free swap minimum to SIZE KiB\n" + " -k kill mode: 0, 1, 2, 3 (default 1)\n" + " -i cpu iowait value (default 30)\n" + " -I cpu system value (default 40)\n" + " -n enable d-bus notifications\n" + " -N /PATH/TO/SCRIPT call script after oom kill\n" + " -g kill all processes within a process group\n" + " -d enable debugging messages\n" + " -v print version information and exit\n" + " -r INTERVAL memory report interval in seconds (default 10),\n" + " values below 1 are rejected\n" + " -p set niceness of oomkill to -20 and oom_score_adj to\n" + " -100\n" + " --ignore-root-user do not kill processes owned by root\n" + " --prefer REGEX prefer to kill processes matching REGEX\n" + " --avoid REGEX avoid killing processes matching REGEX\n" + " --ignore REGEX ignore processes matching REGEX\n" + " --dryrun dry run (do not kill any processes)\n" + " -h, --help this help text\n", + argv[0]); + exit(0); + case '?': + fprintf(stderr, "Try 'oomkill --help' for more information.\n"); + exit(13); + } + } /* while getopt */ + + // Merge "-M" with "-m" values + if (have_M) { + 
double M_term_percent = 100 * mem_term_kib / (double)m.MemTotalKiB; + double M_kill_percent = 100 * mem_kill_kib / (double)m.MemTotalKiB; + if (have_m) { + // Both -m and -M were passed. Use the lower of both values. + args.mem_term_percent = min(args.mem_term_percent, M_term_percent); + args.mem_kill_percent = min(args.mem_kill_percent, M_kill_percent); + } else { + // Only -M was passed. + args.mem_term_percent = M_term_percent; + args.mem_kill_percent = M_kill_percent; + } + } + // Merge "-S" with "-s" values + if (have_S) { + double S_term_percent = 100 * swap_term_kib / (double)m.SwapTotalKiB; + double S_kill_percent = 100 * swap_kill_kib / (double)m.SwapTotalKiB; + if (have_s) { + // Both -s and -S were passed. Use the lower of both values. + args.swap_term_percent = min(args.swap_term_percent, S_term_percent); + args.swap_kill_percent = min(args.swap_kill_percent, S_kill_percent); + } else { + // Only -S was passed. + args.swap_term_percent = S_term_percent; + args.swap_kill_percent = S_kill_percent; + } + } + if (prefer_cmds) { + args.prefer_regex = &_prefer_regex; + if (regcomp(args.prefer_regex, prefer_cmds, REG_EXTENDED | REG_NOSUB) != 0) { + fatal(6, "could not compile regexp '%s'\n", prefer_cmds); + } + fprintf(stderr, "Preferring to kill process names that match regex '%s'\n", prefer_cmds); + } + if (avoid_cmds) { + args.avoid_regex = &_avoid_regex; + if (regcomp(args.avoid_regex, avoid_cmds, REG_EXTENDED | REG_NOSUB) != 0) { + fatal(6, "could not compile regexp '%s'\n", avoid_cmds); + } + fprintf(stderr, "Will avoid killing process names that match regex '%s'\n", avoid_cmds); + } + if (ignore_cmds) { + args.ignore_regex = &_ignore_regex; + if (regcomp(args.ignore_regex, ignore_cmds, REG_EXTENDED | REG_NOSUB) != 0) { + fatal(6, "could not compile regexp '%s'\n", ignore_cmds); + } + fprintf(stderr, "Will ignore process names that match regex '%s'\n", ignore_cmds); + } + if (set_my_priority) { + bool fail = 0; + if (setpriority(PRIO_PROCESS, 0, -20) != 
0) { + warn("Could not set priority: %s. Continuing anyway\n", strerror(errno)); + fail = 1; + } + int ret = set_oom_score_adj(-100); + if (ret != 0) { + warn("Could not set oom_score_adj: %s. Continuing anyway\n", strerror(ret)); + fail = 1; + } + if (!fail) { + fprintf(stderr, "Priority was raised successfully\n"); + } + } + + startup_selftests(&args); + + int err = mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT); + // kernels older than 4.4 don't support MCL_ONFAULT. Retry without it. + if (err != 0) { + err = mlockall(MCL_CURRENT | MCL_FUTURE); + } + if (err != 0) { + perror("Could not lock memory - continuing anyway"); + } + + metric_init(&args); + // Jump into main poll loop + poll_loop(&args); + + metric_exit(&args); + return 0; +} + +/* set_oom_score_adj writes oom_score_adj to <procdir_path>/<own pid>/oom_score_adj. + * Returns 0 on success, otherwise a positive errno value suitable for strerror(). */ +static int set_oom_score_adj(int oom_score_adj) +{ + char buf[PATH_LEN] = { 0 }; + pid_t pid = getpid(); + + snprintf(buf, sizeof(buf), "%s/%d/oom_score_adj", procdir_path, pid); + FILE* f = fopen(buf, "w"); + if (f == NULL) { + /* report the real reason instead of -1, which strerror() cannot decode */ + return errno ? errno : EIO; + } + + int saved_errno = 0; + /* fprintf returns a negative value on failure, not an errno code: + * capture errno right away, before fclose can overwrite it */ + if (fprintf(f, "%d", oom_score_adj) < 0) { + saved_errno = errno ? errno : EIO; + } + /* fclose flushes the buffered write; it returns non-zero on failure and sets errno */ + if (fclose(f) != 0 && saved_errno == 0) { + saved_errno = errno ? errno : EIO; + } + return saved_errno; +} + +/* Calculate the time we should sleep based upon how far away from the memory and swap + * limits we are (headroom). Returns a millisecond value between 100 and 1000 (inclusive). + * The idea is simple: if memory and swap can only fill up so fast, we know how long we can sleep + * without risking to miss a low memory event. + */ +static unsigned sleep_time_ms(const poll_loop_args_t* args, const meminfo_t* m) +{ + // Maximum expected memory/swap fill rate. In kiB per millisecond ==~ MiB per second. 
+ const long long mem_fill_rate = 6000; // 6000MiB/s seen with "stress -m 4 --vm-bytes 4G" + const long long swap_fill_rate = 800; // 800MiB/s seen with membomb on ZRAM + // Clamp calculated value to this range (milliseconds) + const unsigned min_sleep = 100; + const unsigned max_sleep = 1000; + + long long mem_headroom_kib = (long long)((m->MemAvailablePercent - args->mem_term_percent) * (double)m->UserMemTotalKiB / 100); + if (mem_headroom_kib < 0) { + mem_headroom_kib = 0; + } + long long swap_headroom_kib = (long long)((m->SwapFreePercent - args->swap_term_percent) * (double)m->SwapTotalKiB / 100); + if (swap_headroom_kib < 0) { + swap_headroom_kib = 0; + } + long long ms = mem_headroom_kib / mem_fill_rate + swap_headroom_kib / swap_fill_rate; + if (ms < min_sleep) { + return min_sleep; + } + if (ms > max_sleep) { + return max_sleep; + } + return (unsigned)ms; +} + +/* lowmem_sig compares the limits with the current memory situation + * and returns which signal (SIGKILL, SIGTERM, 0) should be sent in + * response. 0 means that there is enough memory and we should + * not kill anything. 
+ */ + +/* get_kill_args fills *thres for one lowmem_sig() check: the iowait/system + * averages selected by args->kill_mode (avg30 for mode 2, avg60 for mode 3, + * avg10 otherwise, 0 for mode 0) and the configured thresholds shifted by + * -10 when `fast` is non-zero and by +10 otherwise. Always returns 0. */ +static int get_kill_args(const poll_loop_args_t* args, struct kill_args *thres, int fast) +{ + long iowait_avg = 0; + long iowait_thres = 0; + long sys_thres = 0; + long sys_avg = 0; + int ret = 0; + int kill_mode = 0; + + kill_mode = args->kill_mode; + switch (kill_mode) { + case KILL_MODE_0: + memset(thres, 0, sizeof(*thres)); + break; + case KILL_MODE_2: + iowait_avg = args->cstat_util.iowait_avg30; + sys_avg = args->cstat_util.system_avg30; + break; + case KILL_MODE_3: + iowait_avg = args->cstat_util.iowait_avg60; + sys_avg = args->cstat_util.system_avg60; + break; + default: + iowait_avg = args->cstat_util.iowait_avg10; + sys_avg = args->cstat_util.system_avg10; + } + + if (fast) { + iowait_thres = args->iowait_thres - 10; + sys_thres = args->sys_thres - 10; + } else { + iowait_thres = args->iowait_thres + 10; + sys_thres = args->sys_thres + 10; + } + + thres->iowait_avg = iowait_avg; + thres->iowait_thres = iowait_thres; + thres->sys_avg = sys_avg; + thres->sys_thres = sys_thres; + thres->kill_mode = kill_mode; + + return ret; +} + +/* high_iowait returns non-zero when cpu iowait is considered high: in + * KILL_MODE_0 the current value alone is compared against iowait_thres - 15, + * otherwise both the selected average and the current value must reach their limits. */ +static int high_iowait(const poll_loop_args_t* args, struct kill_args *thres) +{ + if (thres->kill_mode == KILL_MODE_0) + return (args->cstat_util.iowait > (args->iowait_thres - 15)); + + else + return !!((thres->iowait_avg >= thres->iowait_thres) && (args->cstat_util.iowait > args->iowait_thres)); +} + +/* high_system is the cpu system-time counterpart of high_iowait. */ +static int high_system(const poll_loop_args_t* args, struct kill_args *thres) +{ + /* must be a comparison: "=" assigned 0 to kill_mode, which made this + * branch unreachable and clobbered the caller's thresholds */ + if (thres->kill_mode == KILL_MODE_0) + return (args->cstat_util.system >= (args->sys_thres - 15)); + else + return !!((thres->sys_avg >= thres->sys_thres) && (args->cstat_util.system >= args->sys_thres)); +} + +/* low_mem returns non-zero when available memory is at or below both the + * percentage limit and the absolute KiB limit. `fast` selects the SIGTERM + * limits (mem_term_percent / WARN_KSIZE), otherwise the SIGKILL limits + * (mem_kill_percent / KILL_KSIZE) apply. `thres` is currently not consulted. */ +static int low_mem(const poll_loop_args_t* args, const meminfo_t* m, int fast, struct kill_args *thres) +{ + /* double, like the configured percentages, so "-m 7.5" is not truncated to 7 */ + double percent = 0; + long size = 0; + int ret = 0; + if (fast) { + size = WARN_KSIZE; + percent = args->mem_term_percent; + } else { + size = KILL_KSIZE; + percent = 
args->mem_kill_percent; + } + ret = !!((m->MemAvailablePercent <= percent) && (m->MemAvailableKiB <= size)); + return ret; +} + +static int low_cache(const poll_loop_args_t* args, const meminfo_t* m) +{ + int ret ; + + if (args->kill_mode == KILL_MODE_0) + return 1; + + ret = !!((m->MemFileCacheKiB <= m->MemTotalKiB*KILL_CACHE_RATE) && (m->MemFileCacheKiB<= KILL_CACHE_KSIZE)); + return ret; +} + +static int lowmem_sig(const poll_loop_args_t* args, const meminfo_t* m) +{ + struct kill_args thres; + + memset(&thres, 0, sizeof(thres)); + get_kill_args(args, &thres, 0); + if (low_mem(args,m, 0, &thres) && (high_iowait(args, &thres) || high_system(args, &thres)) && low_cache(args, m)) { + return SIGKILL; + } + + memset(&thres, 0, sizeof(thres)); + get_kill_args(args, &thres, 1); + if (low_mem(args,m, 1, &thres) && (high_iowait(args, &thres) || high_system(args, &thres)) && low_cache(args, m)) { + return SIGTERM; + } + + return 0; +} +/* enter warning mode for MemAvailable < MemTotal*10% && MemAvailable < 6.4G + * + * + */ +static int mem_status(poll_loop_args_t* args, const meminfo_t* m) +{ + int mode = args->mode; + if ((m->MemAvailableKiB < WARN_KSIZE) && (m->MemAvailableKiB <= (m->MemTotalKiB * WARN_RATE))) { + mode = WARN; + } + if ((m->MemAvailableKiB > NOR_KSIZE) || ((m->MemAvailableKiB > m->MemTotalKiB * NOR_RATE))) { + mode = NORMAL; + } + return mode; +} + +void print_iowait(poll_loop_args_t *poll) +{ + warn("iowait: %.2f%% iowaitavg10: %2.f%% iowaitavg30: %2.f%% iowaitavg60: %2.f%%\n", \ + poll->cstat_util.iowait, poll->cstat_util.iowait_avg10, \ + poll->cstat_util.iowait_avg30, poll->cstat_util.iowait_avg60); +} + +void print_system(poll_loop_args_t *poll) +{ + warn("system: %.2f%% systemavg10: %2.f%% systemavg30: %2.f%% systemavg60: %2.f%%\n", \ + poll->cstat_util.system, poll->cstat_util.system_avg10, \ + poll->cstat_util.system_avg30, poll->cstat_util.system_avg60); +} + + +void print_killinfo(poll_loop_args_t *poll) +{ + meminfo_t m; + m = 
parse_meminfo(); + print_mem_stats(warn, m); + warn("min:%ld low: %ld high: %ld\n", poll->min, poll->low, poll->high); + print_iowait(poll); + print_system(poll); + warn("\n"); +} + + +// poll_loop is the main event loop. Never returns. +static void poll_loop(poll_loop_args_t* args) +{ + // Print a a memory report when this reaches zero. We start at zero so + // we print the first report immediately. + int report_countdown_ms = 0; + int report_prev_ms = args->report_interval_ms; + struct timeval start, end; + int ret = 0; + + printf("args->report_interval_ms:%d\n",args->report_interval_ms); + args->mode = NORMAL; + while (1) { + gettimeofday(&start, NULL); + meminfo_t m = parse_meminfo(); + int sig = 0; + args->m = m; + + get_cpu_stat(args); + mem_status(args, &m); + if ((args->mode != WARN) && (mem_status(args, &m)== WARN)) { + args->report_interval_ms = 1000; + args->mode = WARN; + warn("low Available memory entry warning mode \n"); + } else if ((args->mode != NORMAL) && (mem_status(args, &m)== NORMAL)) { + warn("normal Available memory entry normal mode :%d \n", args->mode); + args->report_interval_ms = report_prev_ms; + args->mode = NORMAL; + } + + sig = lowmem_sig(args, &m); + if (sig == SIGKILL) { + warn("low memory! at or below SIGKILL limits: mem " PRIPCT ", swap " PRIPCT "\n", + args->mem_kill_percent, args->swap_kill_percent); + //print_mem_stats(warn, m); + } else if (sig == SIGTERM) { + warn("low memory! at or below SIGTERM limits: mem " PRIPCT ", swap " PRIPCT "\n", + args->mem_term_percent, args->swap_term_percent); + //print_mem_stats(warn, m); + } + if (sig) { + procinfo_t victim = find_largest_process(args); + /* The run time of find_largest_process is proportional to the number + * of processes, and takes 2.5ms on my box with a running Gnome desktop (try "make bench"). + * This is long enough that the situation may have changed in the meantime, + * so we double-check if we still need to kill anything. 
+ * The run time of parse_meminfo is only 6us on my box and independent of the number + * of processes (try "make bench"). + */ + m = parse_meminfo(); + args->m = m; + if (lowmem_sig(args, &m) == 0) { + warn("memory situation has recovered while selecting victim\n"); + } else { + kill_process(args, sig, &victim); + print_killinfo(args); + } + } else if (args->report_interval_ms && report_countdown_ms <= 0) { + print_mem_stats(warn, m); + print_iowait(args); + print_system(args); + if (args->mode == NORMAL) + report_countdown_ms = args->report_interval_ms * 6; + else + report_countdown_ms = args->report_interval_ms * 5; + } + gettimeofday(&end, NULL); + long sleep_ms = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000; + if (args->report_interval_ms > sleep_ms) + sleep_ms = args->report_interval_ms - sleep_ms; + else + sleep_ms = args->report_interval_ms; + + struct timespec req = { .tv_sec = (time_t)(sleep_ms / 1000), .tv_nsec = (sleep_ms % 1000) * 1000000 }; + if (args->mode == WARN) + nanosleep(&req, &req); + else { + ret = event_poll(args, sleep_ms); + if (ret > 0) { + args->mode = WARN; + args->report_interval_ms = 1000; + warn("event poll set to waring mode\n"); + } + } + report_countdown_ms -= (int)sleep_ms; + } +} diff --git a/source/tools/monitor/oomkill/meminfo.c b/source/tools/monitor/oomkill/meminfo.c new file mode 100644 index 0000000000000000000000000000000000000000..4f02e886208b9cc47a9d896b708b578d86fcabf6 --- /dev/null +++ b/source/tools/monitor/oomkill/meminfo.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: MIT + +/* Parse /proc/meminfo + * Returned values are in kiB */ + +#include +#include +#include // for size_t +#include +#include +#include +#include +#include + +#include "globals.h" +#include "meminfo.h" +#include "msg.h" + +/* Parse the contents of /proc/meminfo (in buf), return value of "name" + * (example: "MemTotal:") + * Returns -errno if the entry cannot be found. 
*/ +static long long get_entry(const char* name, const char* buf) +{ + char* hit = strstr(buf, name); + if (hit == NULL) { + return -ENODATA; + } + + errno = 0; + long long val = strtoll(hit + strlen(name), NULL, 10); + if (errno != 0) { + int strtoll_errno = errno; + warn("%s: strtol() failed: %s", __func__, strerror(errno)); + return -strtoll_errno; + } + return val; +} + +/* Like get_entry(), but exit if the value cannot be found */ +static long long get_entry_fatal(const char* name, const char* buf) +{ + long long val = get_entry(name, buf); + if (val < 0) { + warn("%s: fatal error, dumping buffer for later diagnosis:\n%s", __func__, buf); + fatal(104, "could not find entry '%s' in /proc/meminfo: %s\n", name, strerror((int)-val)); + } + return val; +} + +/* If the kernel does not provide MemAvailable (introduced in Linux 3.14), + * approximate it using other data we can get */ +static long long available_guesstimate(const char* buf) +{ + long long Cached = get_entry_fatal("Cached:", buf); + long long MemFree = get_entry_fatal("MemFree:", buf); + long long Buffers = get_entry_fatal("Buffers:", buf); + long long Shmem = get_entry_fatal("Shmem:", buf); + + return MemFree + Cached + Buffers - Shmem; +} + +/* Parse /proc/meminfo. + * This function either returns valid data or kills the process + * with a fatal error. + */ +meminfo_t parse_meminfo() +{ + // Note that we do not need to close static FDs that we ensure to + // `fopen()` maximally once. + static FILE* fd; + static int guesstimate_warned = 0; + // On Linux 5.3, "wc -c /proc/meminfo" counts 1391 bytes. + // 8192 should be enough for the foreseeable future. 
+ char buf[8192] = { 0 }; + meminfo_t m = { 0 }; + + if (fd == NULL) { + char buf[PATH_LEN] = { 0 }; + snprintf(buf, sizeof(buf), "%s/%s", procdir_path, "meminfo"); + fd = fopen(buf, "r"); + } + if (fd == NULL) { + fatal(102, "could not open /proc/meminfo: %s\n", strerror(errno)); + } + rewind(fd); + + size_t len = fread(buf, 1, sizeof(buf) - 1, fd); + if (ferror(fd)) { + fatal(103, "could not read /proc/meminfo: %s\n", strerror(errno)); + } + if (len == 0) { + fatal(103, "could not read /proc/meminfo: 0 bytes returned\n"); + } + + m.MemTotalKiB = get_entry_fatal("MemTotal:", buf); + m.MemFreeKiB = get_entry_fatal("MemFree:", buf); + m.SwapTotalKiB = get_entry_fatal("SwapTotal:", buf); + m.AnonPagesKiB = get_entry_fatal("AnonPages:", buf); + m.SwapFreeKiB = get_entry_fatal("SwapFree:", buf); + m.MemFileCacheKiB = get_entry_fatal("Active(file):", buf); + m.MemFileCacheKiB += get_entry_fatal("Inactive(file):", buf); + + m.MemAvailableKiB = get_entry("MemAvailable:", buf); + if (m.MemAvailableKiB < 0) { + m.MemAvailableKiB = available_guesstimate(buf); + if (guesstimate_warned == 0) { + fprintf(stderr, "Warning: Your kernel does not provide MemAvailable data (needs 3.14+)\n" + " Falling back to guesstimate\n"); + guesstimate_warned = 1; + } + } + + // Calculated values + m.UserMemTotalKiB = m.MemAvailableKiB + m.AnonPagesKiB; + + // Calculate percentages + m.MemAvailablePercent = (double)m.MemAvailableKiB * 100 / (double)m.UserMemTotalKiB; + if (m.SwapTotalKiB > 0) { + m.SwapFreePercent = (double)m.SwapFreeKiB * 100 / (double)m.SwapTotalKiB; + } else { + m.SwapFreePercent = 0; + } + + return m; +} + +bool is_alive(int pid) +{ + // whole process group (-g flag)? + if (pid < 0) { + // signal 0 does nothing, but we do get an error when the process + // group does not exist. 
+ int res = kill(pid, 0); + if (res == 0) { + return true; + } + return false; + } + + char buf[PATH_LEN] = { 0 }; + // Read /proc/[pid]/stat + snprintf(buf, sizeof(buf), "%s/%d/stat", procdir_path, pid); + FILE* f = fopen(buf, "r"); + if (f == NULL) { + // Process is gone - good. + return false; + } + + // File content looks like this: + // 10751 (cat) R 2663 10751 2663[...] + // File may be bigger than 256 bytes, but we only need the first 20 or so. + memset(buf, 0, sizeof(buf)); + size_t len = fread(buf, 1, sizeof(buf), f); + bool read_error = ferror(f) || len == 0; + fclose(f); + if (read_error) { + warn("%s: fread failed: %s\n", __func__, strerror(errno)); + return false; + } + + // Find last ")" by searching from the end + int i = sizeof(buf) - 1; + for (; i >= 0; i--) { + if (buf[i] == ')') + break; + } + if (i <= 0 || i + 2 >= (int)sizeof(buf)) { + warn("%s: could not find closing bracket\n", __func__); + return false; + } + char state = buf[i + 2]; + + debug("process state: %c\n", state); + if (state == 'Z') { + // A zombie process does not use any memory. Consider it dead. + return false; + } + return true; +} + +/* Read /proc/[pid]/[name] and convert to integer. + * As the value may legitimately be < 0 (think oom_score_adj), + * it is stored in the `out` pointer, and the return value is either + * 0 (success) or -errno (failure). + */ +static int read_proc_file_integer(const int pid, const char* name, int* out) +{ + char path[PATH_LEN] = { 0 }; + snprintf(path, sizeof(path), "%s/%d/%s", procdir_path, pid, name); + FILE* f = fopen(path, "r"); + if (f == NULL) { + return -errno; + } + int matches = fscanf(f, "%d", out); + fclose(f); + if (matches != 1) { + return -ENODATA; + } + return 0; +} + +/* Read /proc/[pid]/oom_score. + * Returns the value (>= 0) or -errno on error. 
+ */ +int get_oom_score(const int pid) +{ + int out = 0; + int res = read_proc_file_integer(pid, "oom_score", &out); + if (res < 0) { + return res; + } + return out; +} + +/* Read /proc/[pid]/oom_score_adj. + * As the value may legitimately be negative, the return value is + * only used for error indication, and the value is stored in + * the `out` pointer. + * Returns 0 on success and -errno on error. + */ +int get_oom_score_adj(const int pid, int* out) +{ + return read_proc_file_integer(pid, "oom_score_adj", out); +} + +/* Read /proc/[pid]/comm (process name truncated to 16 bytes). + * Returns 0 on success and -errno on error. + */ +int get_comm(int pid, char* out, size_t outlen) +{ + char path[PATH_LEN] = { 0 }; + snprintf(path, sizeof(path), "%s/%d/comm", procdir_path, pid); + FILE* f = fopen(path, "r"); + if (f == NULL) { + return -errno; + } + size_t n = fread(out, 1, outlen - 1, f); + if (ferror(f)) { + int fread_errno = errno; + perror("get_comm: fread() failed"); + fclose(f); + return -fread_errno; + } + fclose(f); + // Process name may be empty, but we should get at least a newline + // Example for empty process name: perl -MPOSIX -e '$0=""; pause' + if (n < 1) { + return -ENODATA; + } + // Strip trailing newline + out[n - 1] = 0; + fix_truncated_utf8(out); + return 0; +} + +// Get the effective uid (EUID) of `pid`. +// Returns the uid (>= 0) or -errno on error. +int get_uid(int pid) +{ + char path[PATH_LEN] = { 0 }; + snprintf(path, sizeof(path), "/proc/%d", pid); + struct stat st = { 0 }; + int res = stat(path, &st); + if (res < 0) { + return -errno; + } + return (int)st.st_uid; +} + +// Read VmRSS from /proc/[pid]/statm and convert to kiB. +// Returns the value (>= 0) or -errno on error. 
+long long get_vm_rss_kib(int pid) +{ + long long vm_rss_kib = -1; + char path[PATH_LEN] = { 0 }; + + // Read VmRSS from /proc/[pid]/statm (in pages) + snprintf(path, sizeof(path), "%s/%d/statm", procdir_path, pid); + FILE* f = fopen(path, "r"); + if (f == NULL) { + return -errno; + } + int matches = fscanf(f, "%*u %lld", &vm_rss_kib); + fclose(f); + if (matches < 1) { + return -ENODATA; + } + + // Read and cache page size + static long page_size; + if (page_size == 0) { + page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + fatal(1, "could not read page size\n"); + } + } + + // Convert to kiB + vm_rss_kib = vm_rss_kib * page_size / 1024; + return vm_rss_kib; +} + +/* Print a status line like + * mem avail: 5259 MiB (67 %), swap free: 0 MiB (0 %)" + * as an informational message to stdout (default), or + * as a warning to stderr. + */ +void print_mem_stats(int __attribute__((format(printf, 1, 2))) (*out_func)(const char* fmt, ...), const meminfo_t m) +{ + out_func("mem avail: %5lld of %5lld MiB (" PRIPCT "), filecache: %5lld MiB, free: %5lld MiB\n", + m.MemAvailableKiB / 1024, + m.UserMemTotalKiB / 1024, + m.MemAvailablePercent, + m.MemFileCacheKiB/1024, + m.MemFreeKiB/1024); +} diff --git a/source/tools/monitor/oomkill/meminfo.h b/source/tools/monitor/oomkill/meminfo.h new file mode 100644 index 0000000000000000000000000000000000000000..22df1ef5af066bc7063138c140fe49ceabe3a732 --- /dev/null +++ b/source/tools/monitor/oomkill/meminfo.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef MEMINFO_H +#define MEMINFO_H +#include +#include + +#define PATH_LEN 256 + +#define NOR_KSIZE (8*1024*1024) //8G + +#define WARN_KSIZE (6.4*1024*1024) //6.4G +#define KILL_KSIZE (5*1024*1024) //5G + +#define WARN_RATE (0.10) //10% +#define NOR_RATE (0.12) //12% + + +#define KILL_CACHE_KSIZE (3*1024*1024) +#define KILL_CACHE_RATE (0.05) //%5 + +/* + * NORMAL: MemAvailable > memToal*10% + * WARN: MemAvailable < memTotal*10% + * CRI: kswaped is active for memory 
reclaim + * ALERT: system entry direct memory reclaim + * EMER : system is block for direct memory reclaim(may oom kill) + * */ +typedef enum {NORMAL=1,WARN,CRIT,ALERT,EMER} memstatus; + +typedef struct { + // Values from /proc/meminfo, in KiB + long long MemTotalKiB; + long long MemFreeKiB; + long long MemFileCacheKiB; + long long MemAvailableKiB; + long long SwapTotalKiB; + long long SwapFreeKiB; + long long AnonPagesKiB; + // Calculated values + // UserMemTotalKiB = MemAvailableKiB + AnonPagesKiB. + // Represents the total amount of memory that may be used by user processes. + long long UserMemTotalKiB; + // Calculated percentages + double MemAvailablePercent; // percent of total memory that is available + double SwapFreePercent; // percent of total swap that is free +} meminfo_t; + +typedef struct procinfo { + int pid; + int uid; + long badness; + int oom_score_adj; + long long VmRSSkiB; + char name[PATH_LEN]; +} procinfo_t; + +meminfo_t parse_meminfo(); +bool is_alive(int pid); +void print_mem_stats(int (*out_func)(const char* fmt, ...), const meminfo_t m); +int get_oom_score(int pid); +int get_oom_score_adj(const int pid, int* out); +long long get_vm_rss_kib(int pid); +int get_comm(int pid, char* out, size_t outlen); +int get_uid(int pid); + +#endif diff --git a/source/tools/monitor/oomkill/metric.c b/source/tools/monitor/oomkill/metric.c new file mode 100644 index 0000000000000000000000000000000000000000..472fa3d8c89d7e81d02e093f41423a4fa5912539 --- /dev/null +++ b/source/tools/monitor/oomkill/metric.c @@ -0,0 +1,235 @@ +#include "meminfo.h" +#include "kill.h" +#include "metric.h" +#include +#include +#include +#include +#include +#define BUFSIZE 256 + + long get_watermark_scale_factor(void) +{ + int fd; + char path[] = "/proc/sys/vm/watermark_scale_factor"; + char buffer[32]; + long watermark_scale_factor = 0; + + fd = open(path, O_RDONLY); + if (fd < 0) { + return 0; + } + + if (read(fd, buffer, sizeof(buffer)) < 0) { + close(fd); + return 0; + } + + 
close(fd); + + watermark_scale_factor = atol(buffer); + + printf("watermark_scale_factor: %ld ", watermark_scale_factor); + + return watermark_scale_factor; +} + + long get_min_free(void) +{ + int fd; + char path[] = "/proc/sys/vm/min_free_kbytes"; + char buffer[32]; + long min_free_kbytes; + + fd = open(path, O_RDONLY); + if (fd < 0) { + printf("Failed to open file"); + return -1; + } + + if (read(fd, buffer, sizeof(buffer)) < 0) { + printf("Failed to read file"); + return -1; + } + + close(fd); + + min_free_kbytes = atol(buffer); + + printf("min_free_kbytes: %ld ", min_free_kbytes); + + return min_free_kbytes; +} + +int get_watermark(poll_loop_args_t *poll, meminfo_t *m) +{ + long min_free_kbyte; + long watermark_scale_factor; + long tmp = 0; + + min_free_kbyte = get_min_free(); + poll->min = min_free_kbyte; + + watermark_scale_factor = get_watermark_scale_factor(); + if (watermark_scale_factor) { + tmp = m->MemTotalKiB*watermark_scale_factor/10000; + } + + if ((min_free_kbyte>>2) > tmp) + tmp = min_free_kbyte>>2; + + poll->low = tmp + min_free_kbyte; + poll->high = tmp*2 + min_free_kbyte; + + printf("min:%ld low:%ld high:%ld\n", poll->min, poll->low, poll->high); + return 0; +} + + +float factor_x(float interval,float avg) +{ + return 1.0/expf(interval/avg); +} + +//load1 = load0 * e + active * (1 - e) +float avg_x(float curr, float prev, float factor) +{ + return prev*factor + curr*(1-factor); +} + +static void read_cpu_stat(struct cpu_stat *cs) { + FILE *fp = fopen("/proc/stat", "r"); + if (fp == NULL) { + printf("open /proc/stat error\n"); + exit(1); + } + fscanf(fp, "cpu %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld", + &cs->user, &cs->nice, &cs->system, &cs->idle, &cs->iowait, + &cs->irq, &cs->softirq, &cs->steal, &cs->guest, &cs->guest_nice); + fclose(fp); +} + + +static void diff_cpu_stat(struct cpu_stat *prev, struct cpu_stat *curr, struct cpu_stat *diff) { + diff->user = curr->user - prev->user; + diff->nice = curr->nice - prev->nice; + diff->system = 
curr->system - prev->system; + diff->idle = curr->idle - prev->idle; + diff->iowait = curr->iowait - prev->iowait; + diff->irq = curr->irq - prev->irq; + diff->softirq = curr->softirq - prev->softirq; + diff->steal = curr->steal - prev->steal; + diff->guest = curr->guest - prev->guest; + diff->guest_nice = curr->guest_nice - prev->guest_nice; +} + +static int calc_percent(struct cpu_stat *diff, poll_loop_args_t *poll) { + + int interval = poll->report_interval_ms/1000; + float curr = 0.0; + float prev_avg = 0.0; + + long total = diff->user + diff->nice + diff->system + diff->idle + diff->iowait + + diff->irq + diff->softirq + diff->steal + diff->guest + + diff->guest_nice; + if (total == 0) { + memset(&poll->cstat_util, 0, sizeof(poll->cstat_util)); + } + else { + poll->cstat_util.iowait = (float)diff->iowait / total * 100.0; + poll->cstat_util.user = (float)diff->user / total * 100.0; + poll->cstat_util.system = (float)diff->system / total * 100.0; + poll->cstat_util.idle = (float)diff->idle / total * 100.0; + /* for iowait avg */ + prev_avg = poll->cstat_util.iowait_avg10; + curr = poll->cstat_util.iowait; + poll->cstat_util.iowait_avg10 = avg_x(curr, prev_avg, factor_x(interval,10)); + + prev_avg = poll->cstat_util.iowait_avg30; + poll->cstat_util.iowait_avg30 = avg_x(curr, prev_avg, factor_x(interval,30)); + + prev_avg = poll->cstat_util.iowait_avg60; + poll->cstat_util.iowait_avg60 = avg_x(curr, prev_avg, factor_x(interval,60)); + + /* for system avg */ + prev_avg = poll->cstat_util.system_avg10; + curr = poll->cstat_util.system; + poll->cstat_util.system_avg10 = avg_x(curr, prev_avg, factor_x(interval,10)); + + prev_avg = poll->cstat_util.system_avg30; + poll->cstat_util.system_avg30 = avg_x(curr, prev_avg, factor_x(interval,30)); + + prev_avg = poll->cstat_util.system_avg60; + poll->cstat_util.system_avg60 = avg_x(curr, prev_avg, factor_x(interval,60)); + + } + return 0; +} + +int get_cpu_stat(poll_loop_args_t *poll) +{ + struct cpu_stat curr, diff; + + 
read_cpu_stat(&curr); + diff_cpu_stat(&poll->cstat_prev, &curr, &diff); + calc_percent(&diff, poll); + poll->cstat_prev = curr; + //printf("user: %.2f%% iowait: %.2f%%\n", poll->cstat_util.user,poll->cstat_util.iowait); + //printf("avg10: %.2f%% avg30: %.2f%% avg60:%.2f%% \n", poll->cstat_util.iowait_avg10,poll->cstat_util.iowait_avg30, poll->cstat_util.iowait_avg60); + return 0; +} + +int event_init(poll_loop_args_t *poll) +{ + char buf[BUFSIZE]; + int ret = 0; + poll->poll_fd = eventfd(0, 0); + + snprintf(buf, BUFSIZE, "%s", "/sys/fs/cgroup/memory/cgroup.event_control"); + poll->eventc_fd = open(buf, O_WRONLY); + + snprintf(buf, BUFSIZE, "%s","/sys/fs/cgroup/memory/memory.pressure_level"); + poll->pressure_fd = open(buf, O_RDONLY); + + snprintf(buf, BUFSIZE, "%d %d low", poll->poll_fd, poll->pressure_fd); + write(poll->eventc_fd, buf, strlen(buf)); + + poll->pfd.fd = poll->poll_fd; + poll->pfd.events = POLLIN; +} + +int event_poll(poll_loop_args_t *polls, int timeout) +{ + unsigned long u; + int ret = 0; + + ret = poll(&polls->pfd, 1, timeout); + if (ret > 0) { + read(polls->poll_fd, &u, sizeof(u)); + } + return ret; +} + +int event_uninit(poll_loop_args_t *poll) +{ + close(poll->poll_fd); + close(poll->eventc_fd); + close(poll->pressure_fd); + return 0; +} +int metric_init(poll_loop_args_t *poll) +{ + memset(&poll->cstat_util, 0, sizeof(poll->cstat_util)); + memset(&poll->cstat_prev, 0, sizeof(poll->cstat_prev)); + meminfo_t m = parse_meminfo(); + get_watermark(poll, &m); + event_init(poll); + + return 0; +} + +int metric_exit(poll_loop_args_t *poll) +{ + event_uninit(poll); + return 0; +} diff --git a/source/tools/monitor/oomkill/metric.h b/source/tools/monitor/oomkill/metric.h new file mode 100644 index 0000000000000000000000000000000000000000..51e401d2537ae08afc779c6dfe99da8443492ac1 --- /dev/null +++ b/source/tools/monitor/oomkill/metric.h @@ -0,0 +1,41 @@ +#ifndef __MISC__ +#define __MISC__ +#include +#include +#include +#include +#include + +struct 
cpu_util { + float user; + float nice; + float system; + float system_avg10; + float system_avg30; + float system_avg60; + float idle; + float iowait; + float iowait_avg10; + float iowait_avg30; + float iowait_avg60; + float irq; + float softirq; + float steal; + float guest; + float guest_nice; +}; + +struct cpu_stat { + long user; + long nice; + long system; + long idle; + long iowait; + long irq; + long softirq; + long steal; + long guest; + long guest_nice; +}; + +#endif diff --git a/source/tools/monitor/oomkill/msg.c b/source/tools/monitor/oomkill/msg.c new file mode 100644 index 0000000000000000000000000000000000000000..597dff3503fe33a3be656aca16c7dc08d9bcd442 --- /dev/null +++ b/source/tools/monitor/oomkill/msg.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include // need strlen() +#include + +#include "globals.h" +#include "msg.h" + +// color_log writes to `f`, prefixing the `color` code if `f` is a tty. +static void color_log(FILE* f, const char* color, const char* fmt, va_list vl) +{ + // Find out (and cache) if we should use color + static int stdout_is_tty = -1; + static int stderr_is_tty = -1; + static int no_color = -1; + bool is_tty = false; + + if (no_color == -1) { + // https://no-color.org/ + if (getenv("NO_COLOR") != NULL) { + no_color = 1; + } else { + no_color = 0; + } + } + if (no_color == 0) { + if (fileno(f) == fileno(stdout)) { + if (stdout_is_tty == -1) { + stdout_is_tty = isatty(fileno(stdout)); + } + is_tty = stdout_is_tty; + } else if (fileno(f) == fileno(stderr)) { + if (stderr_is_tty == -1) { + stderr_is_tty = isatty(fileno(stderr)); + } + is_tty = stderr_is_tty; + } + } + + // fds other than stdout and stderr never get color + const char* reset = "\033[0m"; + if (!is_tty) { + color = ""; + reset = ""; + } + + fputs(color, f); + vfprintf(f, fmt, vl); + fputs(reset, f); + // The `reset` control was not flushed out by the + // newline as it was sent after. 
Manually flush + // it now to prevent artifacts when stderr and stdout + // mix. + if (fmt[strlen(fmt) - 1] == '\n') { + fflush(f); + } +} + +// Print message, prefixed with "fatal: ", to stderr and exit with "code". +// Example: fatal(6, "could not compile regexp '%s'\n", regex_str); +int fatal(int code, char* fmt, ...) +{ + const char* red = "\033[31m"; + char fmt2[MSG_LEN] = { 0 }; + snprintf(fmt2, sizeof(fmt2), "fatal: %s", fmt); + va_list vl; + va_start(vl, fmt); + color_log(stderr, red, fmt2, vl); + va_end(vl); + exit(code); +} + +// Print a yellow warning message to stderr. No "warning" prefix is added. +int warn(const char* fmt, ...) +{ + const char* yellow = "\033[33m"; + va_list vl; + va_start(vl, fmt); + color_log(stderr, yellow, fmt, vl); + va_end(vl); + return 0; +} + +// Print a gray debug message to stdout. No prefix is added. +int debug(const char* fmt, ...) +{ + if (!enable_debug) { + return 0; + } + const char* gray = "\033[2m"; + va_list vl; + va_start(vl, fmt); + color_log(stdout, gray, fmt, vl); + va_end(vl); + return 0; +} + +// Parse a floating point value, check conversion errors and allowed range. +// Guaranteed value range: 0 <= val <= upper_limit. +// An error is indicated by storing an error message in tuple->err and returning 0. 
+static double parse_part(term_kill_tuple_t* tuple, const char* part, long long upper_limit) +{ + errno = 0; + char* endptr = 0; + double val = strtod(part, &endptr); + if (*endptr != '\0') { + snprintf(tuple->err, sizeof(tuple->err), + "trailing garbage '%s'", endptr); + return 0; + } + if (errno) { + snprintf(tuple->err, sizeof(tuple->err), + "conversion error: %s", strerror(errno)); + return 0; + } + if (val > (double)upper_limit) { + snprintf(tuple->err, sizeof(tuple->err), + "value %lf exceeds limit %lld", val, upper_limit); + return 0; + } + if (val < 0) { + snprintf(tuple->err, sizeof(tuple->err), + "value %lf below zero", val); + return 0; + } + return val; +} + +// Parse the "term[,kill]" tuple in optarg, examples: "123", "123,456". +// Guaranteed value range: 0 <= term <= kill <= upper_limit. +term_kill_tuple_t parse_term_kill_tuple(const char* optarg, long long upper_limit) +{ + term_kill_tuple_t tuple = { 0 }; + // writable copy of optarg + char buf[MSG_LEN] = { 0 }; + + if (strlen(optarg) > (sizeof(buf) - 1)) { + snprintf(tuple.err, sizeof(tuple.err), + "argument too long (%zu bytes)", strlen(optarg)); + return tuple; + } + strncpy(buf, optarg, sizeof(buf) - 1); + // Split string on "," into two parts + char* part1 = buf; + char* part2 = NULL; + char* comma = strchr(buf, ','); + if (comma) { + // Zero-out the comma, truncates part1 + *comma = '\0'; + // part2 gets zero or more bytes after the comma + part2 = comma + 1; + } + // Parse part1 + tuple.term = parse_part(&tuple, part1, upper_limit); + if (strlen(tuple.err)) { + return tuple; + } + if (part2) { + // Parse part2 + tuple.kill = parse_part(&tuple, part2, upper_limit); + if (strlen(tuple.err)) { + return tuple; + } + } else { + // User passed only the SIGTERM value: the SIGKILL value is calculated as + // SIGTERM/2. 
+ tuple.kill = tuple.term / 2; + } + // Setting term < kill makes no sense + if (tuple.term < tuple.kill) { + warn("warning: SIGTERM value %.2lf is below SIGKILL value %.2lf, setting SIGTERM = SIGKILL = %.2lf\n", + tuple.term, tuple.kill, tuple.kill); + tuple.term = tuple.kill; + } + // Sanity checks + if (tuple.kill == 0 && tuple.term == 0) { + snprintf(tuple.err, sizeof(tuple.err), + "both SIGTERM and SIGKILL values are zero"); + return tuple; + } + return tuple; +} + +// Credit to https://gist.github.com/w-vi/67fe49106c62421992a2 +// Only works for string of length 3 and up. This is good enough +// for our use case, which is fixing the 16-byte value we get +// from /proc/[pid]/comm. +// +// Tested in unit_test.go: Test_fix_truncated_utf8() +void fix_truncated_utf8(char* str) +{ + size_t len = strlen(str); + if (len < 3) { + return; + } + // We only need to look at the last three bytes + char* b = str + len - 3; + // Last byte is ascii + if ((b[2] & 0x80) == 0) { + return; + } + // Last byte is multi-byte sequence start + if (b[2] & 0x40) { + b[2] = 0; + } + // Truncated 3-byte sequence + else if ((b[1] & 0xe0) == 0xe0) { + b[1] = 0; + // Truncated 4-byte sequence + } else if ((b[0] & 0xf0) == 0xf0) { + b[0] = 0; + } +} diff --git a/source/tools/monitor/oomkill/msg.h b/source/tools/monitor/oomkill/msg.h new file mode 100644 index 0000000000000000000000000000000000000000..6ece3d461f1daa3469c9492541cd24fe7fcf32eb --- /dev/null +++ b/source/tools/monitor/oomkill/msg.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef MSG_H +#define MSG_H + +#include + +#define MSG_LEN 256 + +// printf format for percentages +#define PRIPCT "%5.2lf%%" + +/* From https://gcc.gnu.org/onlinedocs/gcc-9.2.0/gcc/Common-Function-Attributes.html : + * The format attribute specifies that a function takes printf + * style arguments that should be type-checked against a format string. + */ +int fatal(int code, char* fmt, ...) 
__attribute__((noreturn, format(printf, 2, 3))); +int warn(const char* fmt, ...) __attribute__((format(printf, 1, 2))); +int debug(const char* fmt, ...) __attribute__((format(printf, 1, 2))); + +typedef struct { + // If the conversion failed, err contains the error message. + char err[255]; + // Parsed values. + double term; + double kill; +} term_kill_tuple_t; + +term_kill_tuple_t parse_term_kill_tuple(const char* optarg, long long upper_limit); +void fix_truncated_utf8(char* str); + +#endif