diff --git a/0025-proc_loadavg-fix-ABBA-deadlock-between-read-refresh.patch b/0025-proc_loadavg-fix-ABBA-deadlock-between-read-refresh.patch new file mode 100644 index 0000000000000000000000000000000000000000..796159116df046c8d264be50acd668e04f73aec1 --- /dev/null +++ b/0025-proc_loadavg-fix-ABBA-deadlock-between-read-refresh.patch @@ -0,0 +1,112 @@ +From 931666075fae8b533f0984073d54ce4adfb8e6d1 Mon Sep 17 00:00:00 2001 +From: Alexander Mikhalitsyn +Date: Wed, 9 Aug 2023 18:39:46 +0200 +Subject: [PATCH 17/27] proc_loadavg: fix ABBA deadlock between read/refresh + +Idea of this fix is to always take nested locks in +the same order. + +At the same time, we adding an extra check to insert_node() +that prevents adding a new load_node with the same cgroup +(->cg field) value. This is theoretically possible because +we don't hold .rilock/.lock when we call insert_node(). + +It looks like we have this issue from the initial +implementation of loadavg virtualization and it's hardly +reproducible that's why we weren't able to notice it. + +Fixes: #605 +Signed-off-by: Alexander Mikhalitsyn + +Signed-off-by: wujichao +--- + src/proc_loadavg.c | 37 +++++++++++++++++++++++++++++++++++-- + 1 file changed, 35 insertions(+), 2 deletions(-) + +diff --git a/src/proc_loadavg.c b/src/proc_loadavg.c +index 8cd1e02..78457f8 100644 +--- a/src/proc_loadavg.c ++++ b/src/proc_loadavg.c +@@ -73,6 +73,11 @@ struct load_node { + }; + + struct load_head { ++ /* ++ * To prevent ABBA deadlocks, let's always take this locks in ++ * the order as they specified in this structure. 
++ */ ++ + /* + * The lock is about insert load_node and refresh load_node.To the first + * load_node of each hash bucket, insert and refresh in this hash bucket is +@@ -108,8 +113,8 @@ static struct load_node *locate_node(char *cg, int locate) + struct load_node *f = NULL; + int i = 0; + +- pthread_rwlock_rdlock(&load_hash[locate].rilock); + pthread_rwlock_rdlock(&load_hash[locate].rdlock); ++ pthread_rwlock_rdlock(&load_hash[locate].rilock); + if (load_hash[locate].next == NULL) { + pthread_rwlock_unlock(&load_hash[locate].rilock); + return f; +@@ -121,11 +126,37 @@ static struct load_node *locate_node(char *cg, int locate) + return f; + } + ++/* ++ * Inserts a new load_node into the load_hash table, ++ * if an appropriate node exists then just free (*n) and ++ * rewrite (n) value to an existing load_node pointer. ++ * ++ * We should enter this function without any locks held. ++ * This function leaves &load_hash[hash].rdlock taken. ++ */ + static void insert_node(struct load_node **n, int locate) + { + struct load_node *f; + + pthread_mutex_lock(&load_hash[locate].lock); ++ ++ /* ++ * We have to recheck if the node we are looking for ++ * has appeared in the hash table. In this case we just free ++ * newly created load_node and give an existing load_node to use. 
++ */ ++ f = locate_node((*n)->cg, locate); ++ if (f) { ++ free_disarm((*n)->cg); ++ free_disarm((*n)); ++ *n = f; ++ ++ pthread_mutex_unlock(&load_hash[locate].lock); ++ return; ++ } ++ ++ /* &load_hash[hash].rdlock is taken for read at this point */ ++ + pthread_rwlock_wrlock(&load_hash[locate].rilock); + f = load_hash[locate].next; + load_hash[locate].next = *n; +@@ -219,7 +250,9 @@ int proc_loadavg_read(char *buf, size_t size, off_t offset, + n->total_pid = 1; + n->last_pid = initpid; + n->cfd = cfd; ++ pthread_rwlock_unlock(&load_hash[hash].rdlock); + insert_node(&n, hash); ++ /* &load_hash[hash].rdlock is taken for reading at this point */ + } + a = n->avenrun[0] + (FIXED_1 / 200); + b = n->avenrun[1] + (FIXED_1 / 200); +@@ -570,8 +603,8 @@ static void load_free(void) + + for (int i = 0; i < LOAD_SIZE; i++) { + pthread_mutex_lock(&load_hash[i].lock); +- pthread_rwlock_wrlock(&load_hash[i].rilock); + pthread_rwlock_wrlock(&load_hash[i].rdlock); ++ pthread_rwlock_wrlock(&load_hash[i].rilock); + if (load_hash[i].next == NULL) { + pthread_mutex_unlock(&load_hash[i].lock); + pthread_mutex_destroy(&load_hash[i].lock); +-- +2.45.0 + diff --git a/0026-proc-Fix-proc-cpuinfo-not-respecting-personality.patch b/0026-proc-Fix-proc-cpuinfo-not-respecting-personality.patch new file mode 100644 index 0000000000000000000000000000000000000000..a22355a581c6d68178a6a2e12d219f57ba4dc5e9 --- /dev/null +++ b/0026-proc-Fix-proc-cpuinfo-not-respecting-personality.patch @@ -0,0 +1,96 @@ +From 52d1f78123728ede16846bc23457c9a0219b3131 Mon Sep 17 00:00:00 2001 +From: Mathias Gibbens +Date: Mon, 4 Sep 2023 00:13:57 +0000 +Subject: [PATCH 18/27] proc: Fix /proc/cpuinfo not respecting personality + +It was found that the personality within the container was not being +properly respected, which for large numbers of CPUs would break +reporting of /proc/cpuinfo in arm32 containers running on an arm64 host. 
+ +Signed-off-by: Mathias Gibbens +Conflict: +src/proc_fuse.c: adapt the /proc/partitions introduced by openEuler + +Signed-off-by: vegbir +Signed-off-by: wujichao +--- + src/proc_fuse.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 47 insertions(+), 2 deletions(-) + +diff --git a/src/proc_fuse.c b/src/proc_fuse.c +index ce22974..dd6eaf2 100644 +--- a/src/proc_fuse.c ++++ b/src/proc_fuse.c +@@ -102,6 +102,45 @@ static off_t get_procfile_size(const char *path) + return answer; + } + ++static off_t get_procfile_size_with_personality(const char *path) ++{ ++ struct fuse_context *fc = fuse_get_context(); ++ __u32 host_personality = liblxcfs_personality(), caller_personality; ++ bool change_personality; ++ int ret; ++ off_t procfile_size_ret; ++ ++ if (get_task_personality(fc->pid, &caller_personality) < 0) ++ return log_error(0, "Failed to get caller process (pid: %d) personality", fc->pid); ++ ++ /* do we need to change thread personality? */ ++ change_personality = host_personality != caller_personality; ++ ++ if (change_personality) { ++ ret = personality(caller_personality); ++ if (ret == -1) ++ return log_error(0, "Call to personality(%d) failed: %s\n", ++ caller_personality, strerror(errno)); ++ ++ lxcfs_debug("task (tid: %d) personality was changed %d -> %d\n", ++ (int)syscall(SYS_gettid), ret, caller_personality); ++ } ++ ++ procfile_size_ret = get_procfile_size(path); ++ ++ if (change_personality) { ++ ret = personality(host_personality); ++ if (ret == -1) ++ return log_error(0, "Call to personality(%d) failed: %s\n", ++ host_personality, strerror(errno)); ++ ++ lxcfs_debug("task (tid: %d) personality was restored %d -> %d\n", ++ (int)syscall(SYS_gettid), ret, host_personality); ++ } ++ ++ return procfile_size_ret; ++} ++ + __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb) + { + struct timespec now; +@@ -126,7 +165,10 @@ __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb) + strcmp(path, 
"/proc/swaps") == 0 || + strcmp(path, "/proc/loadavg") == 0 || + strcmp(path, "/proc/partitions") == 0) { +- sb->st_size = get_procfile_size(path); ++ if (liblxcfs_functional()) ++ sb->st_size = get_procfile_size_with_personality(path); ++ else ++ sb->st_size = get_procfile_size(path); + sb->st_mode = S_IFREG | 00444; + sb->st_nlink = 1; + return 0; +@@ -184,7 +226,10 @@ __lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi) + + info->type = type; + +- info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE; ++ if (liblxcfs_functional()) ++ info->buflen = get_procfile_size_with_personality(path) + BUF_RESERVE_SIZE; ++ else ++ info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE; + + info->buf = zalloc(info->buflen); + if (!info->buf) +-- +2.33.0 + diff --git a/0027-proc_loadavg.c-Fix-incompatible-integer-to-pointer-c.patch b/0027-proc_loadavg.c-Fix-incompatible-integer-to-pointer-c.patch new file mode 100644 index 0000000000000000000000000000000000000000..e3c9bc4f305030720a8d766196a854599246b669 --- /dev/null +++ b/0027-proc_loadavg.c-Fix-incompatible-integer-to-pointer-c.patch @@ -0,0 +1,46 @@ +From 2757ab0591148b45cb9fdc67240855da7e58666d Mon Sep 17 00:00:00 2001 +From: Brahmajit Das +Date: Tue, 5 Sep 2023 04:15:06 +0000 +Subject: [PATCH 19/27] proc_loadavg.c: Fix incompatible integer to pointer + conversion + +Newer compiler like Clang 16 and GCC 14 have certain error enabled by +default, namely -Werror=incompatible-function-pointer-types. Which +results in build error such as: + +proc_loadavg.c:606:10: error: incompatible integer to pointer conversion returning int from a function with result type pthread_t + +My patch suppresses the error for now, but a proper fix would be better. +First discovered on Gentoo linux (bug #894348). 
+ +Bug: https://bugs.gentoo.org/894348 +Closes: https://github.com/lxc/lxcfs/issues/561 +Signed-off-by: Brahmajit Das + +Signed-off-by: wujichao +--- + src/proc_loadavg.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/proc_loadavg.c b/src/proc_loadavg.c +index 78457f8..71592bb 100644 +--- a/src/proc_loadavg.c ++++ b/src/proc_loadavg.c +@@ -639,12 +639,12 @@ pthread_t load_daemon(int load_use) + + ret = init_load(); + if (ret == -1) +- return log_error(0, "Initialize hash_table fails in load_daemon!"); ++ return (pthread_t)log_error(0, "Initialize hash_table fails in load_daemon!"); + + ret = pthread_create(&pid, NULL, load_begin, NULL); + if (ret != 0) { + load_free(); +- return log_error(0, "Create pthread fails in load_daemon!"); ++ return (pthread_t)log_error(0, "Create pthread fails in load_daemon!"); + } + + /* use loadavg, here loadavg = 1*/ +-- +2.45.0 + diff --git a/0028-proc-fix-MemAvailable-in-proc-meminfo-to-exclude-tmp.patch b/0028-proc-fix-MemAvailable-in-proc-meminfo-to-exclude-tmp.patch new file mode 100644 index 0000000000000000000000000000000000000000..7a47e794c03279abedd1acfffd5386c1ecb1ff3e --- /dev/null +++ b/0028-proc-fix-MemAvailable-in-proc-meminfo-to-exclude-tmp.patch @@ -0,0 +1,41 @@ +From c8ec4e00abaa7c0bf31d1642f9f274a567a69c68 Mon Sep 17 00:00:00 2001 +From: Kyeong Yoo +Date: Tue, 3 Oct 2023 16:36:51 +1300 +Subject: [PATCH 21/27] proc: fix MemAvailable in /proc/meminfo to exclude + tmpfs files + +The "total_cache" from memory.stat of cgroup includes +the memory used by tmpfs files ("total_shmem"). Considering +it as available memory is wrong because files created +on a tmpfs file system cannot be simply reclaimed. 
+ +So the available memory is calculated with the sum of: + * Memory the kernel knows is free + * Memory that contained in the kernel active file LRU, + that can be reclaimed if necessary + * Memory that is contained in the kernel non-active file + LRU, that can be reclaimed if necessary + +Signed-off-by: Kyeong Yoo + +Signed-off-by: wujichao +--- + src/proc_fuse.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/proc_fuse.c b/src/proc_fuse.c +index dd6eaf2..0d65df5 100644 +--- a/src/proc_fuse.c ++++ b/src/proc_fuse.c +@@ -1667,7 +1667,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, + snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage); + printme = lbuf; + } else if (startswith(line, "MemAvailable:")) { +- snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + mstat.total_cache / 1024); ++ snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + (mstat.total_active_file + mstat.total_inactive_file) / 1024); + printme = lbuf; + } else if (startswith(line, "SwapTotal:")) { + if (wants_swap) { +-- +2.33.0 + diff --git a/0029-proc-Fix-swap-handling-for-cgroups-v2-can_use_swap.patch b/0029-proc-Fix-swap-handling-for-cgroups-v2-can_use_swap.patch new file mode 100644 index 0000000000000000000000000000000000000000..de1840b410e798bf592ddf8075bc94b19bedd9c4 --- /dev/null +++ b/0029-proc-Fix-swap-handling-for-cgroups-v2-can_use_swap.patch @@ -0,0 +1,140 @@ +From 2442b37c838d1702012cdc6792ffb46d45a78526 Mon Sep 17 00:00:00 2001 +From: Alex Hudspith +Date: Mon, 6 Nov 2023 09:17:38 +0000 +Subject: [PATCH 23/27] proc: Fix swap handling for cgroups v2 (can_use_swap) + +On cgroups v2, there are no swap current/max files at the cgroup root, so +can_use_swap must look lower in the hierarchy to determine if swap accounting +is enabled. To also account for memory accounting being turned off at some +level, walk the hierarchy upwards from lxcfs' own cgroup. 
+ +Signed-off-by: Alex Hudspith +[ added check cgroup pointer is not NULL in lxcfs_init() ] +Signed-off-by: Alexander Mikhalitsyn + +Signed-off-by: wujichao +--- + src/bindings.c | 4 +++- + src/cgroups/cgfsng.c | 33 ++++++++++++--------------------- + src/cgroups/cgroup.h | 2 +- + src/proc_fuse.c | 8 +++++--- + 4 files changed, 21 insertions(+), 26 deletions(-) + +diff --git a/src/bindings.c b/src/bindings.c +index eb62ddb..34989e3 100644 +--- a/src/bindings.c ++++ b/src/bindings.c +@@ -835,6 +835,7 @@ static void __attribute__((constructor)) lxcfs_init(void) + { + __do_close int init_ns = -EBADF, root_fd = -EBADF, + pidfd = -EBADF; ++ __do_free char *cgroup = NULL; + int i = 0; + pid_t pid; + +@@ -888,7 +889,8 @@ static void __attribute__((constructor)) lxcfs_init(void) + lxcfs_info("Kernel supports pidfds"); + } + +- can_use_swap = cgroup_ops->can_use_swap(cgroup_ops); ++ cgroup = get_pid_cgroup(pid, "memory"); ++ can_use_swap = cgroup && cgroup_ops->can_use_swap(cgroup_ops, cgroup); + if (can_use_swap) + lxcfs_info("Kernel supports swap accounting"); + else +diff --git a/src/cgroups/cgfsng.c b/src/cgroups/cgfsng.c +index e50c87d..a19089c 100644 +--- a/src/cgroups/cgfsng.c ++++ b/src/cgroups/cgfsng.c +@@ -616,34 +616,25 @@ static int cgfsng_get_memory_swap_max(struct cgroup_ops *ops, + return cgfsng_get_memory(ops, cgroup, "memory.swap.max", value); + } + +-static bool cgfsng_can_use_swap(struct cgroup_ops *ops) ++static bool cgfsng_can_use_swap(struct cgroup_ops *ops, const char *cgroup) + { +- bool has_swap = false; ++ __do_free char *cgroup_rel = NULL, *junk_value = NULL; ++ const char *file; + struct hierarchy *h; + + h = ops->get_hierarchy(ops, "memory"); + if (!h) + return false; + +- if (is_unified_hierarchy(h)) { +- if (faccessat(h->fd, "memory.swap.max", F_OK, 0)) +- return false; +- +- if (faccessat(h->fd, "memory.swap.current", F_OK, 0)) +- return false; +- +- has_swap = true; +- } else { +- if (faccessat(h->fd, "memory.memsw.limit_in_bytes", F_OK, 0)) 
+- return false; +- +- if (faccessat(h->fd, "memory.memsw.usage_in_bytes", F_OK, 0)) +- return false; +- +- has_swap = true; +- } +- +- return has_swap; ++ cgroup_rel = must_make_path_relative(cgroup, NULL); ++ file = is_unified_hierarchy(h) ? "memory.swap.current" : "memory.memsw.usage_in_bytes"; ++ /* For v2, we need to look at the lower levels of the hierarchy because ++ * no 'memory.swap.current' file exists at the root. We must search ++ * upwards in the hierarchy in case memory accounting is disabled via ++ * cgroup.subtree_control for the given cgroup itself. ++ */ ++ int ret = cgroup_walkup_to_root(ops->cgroup2_root_fd, h->fd, cgroup_rel, file, &junk_value); ++ return ret == 0; + } + + static int cgfsng_get_memory_stats(struct cgroup_ops *ops, const char *cgroup, +diff --git a/src/cgroups/cgroup.h b/src/cgroups/cgroup.h +index 2c8cca3..d47418b 100644 +--- a/src/cgroups/cgroup.h ++++ b/src/cgroups/cgroup.h +@@ -146,7 +146,7 @@ struct cgroup_ops { + char **value); + int (*get_memory_swap_max)(struct cgroup_ops *ops, const char *cgroup, + char **value); +- bool (*can_use_swap)(struct cgroup_ops *ops); ++ bool (*can_use_swap)(struct cgroup_ops *ops, const char *cgroup); + + /* cpuset */ + int (*get_cpuset_cpus)(struct cgroup_ops *ops, const char *cgroup, +diff --git a/src/proc_fuse.c b/src/proc_fuse.c +index 0d65df5..cf9011d 100644 +--- a/src/proc_fuse.c ++++ b/src/proc_fuse.c +@@ -463,11 +463,13 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, + } + + if (wants_swap) { +- /* The total amount of swap is always reported to be the ++ /* For cgroups v1, the total amount of swap is always reported to be the + lesser of the RAM+SWAP limit or the SWAP device size. + This is because the kernel can swap as much as it + wants and not only up to swtotal. 
*/ +- swtotal = memlimit / 1024 + swtotal; ++ if (!liblxcfs_memory_is_cgroupv2()) ++ swtotal = memlimit / 1024 + swtotal; ++ + if (hostswtotal < swtotal) { + swtotal = hostswtotal; + } +@@ -1675,7 +1677,7 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, + + sscanf(line + STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal); + +- /* The total amount of swap is always reported to be the ++ /* In cgroups v1, the total amount of swap is always reported to be the + lesser of the RAM+SWAP limit or the SWAP device size. + This is because the kernel can swap as much as it + wants and not only up to swtotal. */ +-- +2.33.0 + diff --git a/0030-proc-Fix-swap-handling-for-cgroups-v2-zero-limits.patch b/0030-proc-Fix-swap-handling-for-cgroups-v2-zero-limits.patch new file mode 100644 index 0000000000000000000000000000000000000000..5864cab3e6fda3b1332724d9fe504af821b86366 --- /dev/null +++ b/0030-proc-Fix-swap-handling-for-cgroups-v2-zero-limits.patch @@ -0,0 +1,178 @@ +From af9133e3294df47ecff75a2d071cb7d33413bc95 Mon Sep 17 00:00:00 2001 +From: Alex Hudspith +Date: Mon, 6 Nov 2023 09:17:38 +0000 +Subject: [PATCH 24/27] proc: Fix swap handling for cgroups v2 (zero limits) + +Since memory.swap.max = 0 is valid under v2, limits of 0 must not be +treated differently. Instead, use UINT64_MAX as the default limit. This aligns +with cgroups v1 behaviour anyway since 'limit_in_bytes' files contain a large +number for unspecified limits (2^63). 
+ +Resolves: #534 +Signed-off-by: Alex Hudspith +conflict: src/proc_fuse.c +Line numbers don't match +Some original logic is inconsistent +Hunk #2 #4 #5 failed +Signed-off-by: vegbir + +conflict: src/proc_fuse.c +note: This patch has been adapted accordingly in the lxcfs 4.0.11 version +Signed-off-by: wujichao +--- + src/proc_fuse.c | 79 ++++++++++++++++++++++++++++++++----------------- + 1 file changed, 52 insertions(+), 27 deletions(-) + +diff --git a/src/proc_fuse.c b/src/proc_fuse.c +index cf9011d..8beb64c 100644 +--- a/src/proc_fuse.c ++++ b/src/proc_fuse.c +@@ -259,20 +259,36 @@ __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi) + return 0; + } + +-static uint64_t get_memlimit(const char *cgroup, bool swap) ++/** ++ * Gets a non-hierarchical memory controller limit, or UINT64_MAX if no limit is ++ * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise ++ * reads the memory (RAM) limits. ++ * ++ * @returns 0 on success (and sets `*limit`), < 0 on error ++ */ ++static int get_memlimit(const char *cgroup, bool swap, uint64_t *limit) + { + __do_free char *memlimit_str = NULL; +- uint64_t memlimit = 0; ++ uint64_t memlimit = UINT64_MAX; + int ret; + + if (swap) + ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str); + else + ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str); +- if (ret > 0 && memlimit_str[0] && safe_uint64(memlimit_str, &memlimit, 10) < 0) +- lxcfs_error("Failed to convert memlimit %s", memlimit_str); +- +- return memlimit; ++ if (ret < 0) ++ return ret; ++ ++ if (memlimit_str[0]) { ++ ret = safe_uint64(memlimit_str, &memlimit, 10); ++ if (ret < 0) { ++ lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s", ++ swap ? 
".swap" : "", memlimit_str, cgroup); ++ return ret; ++ } ++ } ++ *limit = memlimit; ++ return 0; + } + + /* +@@ -337,33 +353,44 @@ static char *gnu_dirname(char *path) + return path; + } + +-static uint64_t get_min_memlimit(const char *cgroup, bool swap) ++/** ++ * Gets a hierarchical memory controller limit, or UINT64_MAX if no limit is ++ * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise ++ * reads the memory (RAM) limits. ++ * ++ * @returns 0 on success (and sets `*limit`), < 0 on error ++ */ ++static int get_min_memlimit(const char *cgroup, bool swap, uint64_t *limit) + { + __do_free char *copy = NULL; +- uint64_t memlimit = 0, retlimit = 0; ++ uint64_t memlimit = UINT64_MAX, retlimit = UINT64_MAX; ++ int ret; + + copy = strdup(cgroup); + if (!copy) + return log_error_errno(0, ENOMEM, "Failed to allocate memory"); + +- retlimit = get_memlimit(copy, swap); +- if (retlimit == 0) +- return 0; ++ ret = get_memlimit(copy, swap, &retlimit); ++ if (ret < 0) ++ return ret; + + /* + * If the cgroup doesn't start with / (probably won't happen), dirname() + * will terminate with "" instead of "/" + */ +- while (*copy && strcmp(copy, "/") != 0) { ++ while (retlimit != 0 && *copy && strcmp(copy, "/") != 0) { + char *it = copy; + + it = gnu_dirname(it); +- memlimit = get_memlimit(it, swap); +- if (memlimit > 0 && memlimit < retlimit) ++ ret = get_memlimit(it, swap, &memlimit); ++ if (ret < 0) ++ return ret; ++ if (memlimit < retlimit) + retlimit = memlimit; +- }; ++ } + +- return retlimit; ++ *limit = retlimit; ++ return 0; + } + + static inline bool startswith(const char *line, const char *pref) +@@ -416,20 +443,18 @@ static int proc_swaps_read(char *buf, size_t size, off_t offset, + return read_file_fuse("/proc/swaps", buf, size, d); + prune_init_slice(cgroup); + +- memlimit = get_min_memlimit(cgroup, false); +- if (memlimit == 0) ++ ret = get_min_memlimit(cgroup, false, &memlimit); ++ if (ret < 0) + return 0; +- + ret = 
cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str); + if (ret < 0) + return 0; +- + if (safe_uint64(memusage_str, &memusage, 10) < 0) + lxcfs_error("Failed to convert memusage %s", memusage_str); + + if (wants_swap) { +- memswlimit = get_min_memlimit(cgroup, true); +- if (memswlimit > 0) { ++ ret = get_min_memlimit(cgroup, true, &memswlimit); ++ if (ret >= 0) { + ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str); + if (ret >= 0 && safe_uint64(memswusage_str, &memswusage, 10) == 0) { + if (memlimit > memswlimit) +@@ -1616,17 +1641,17 @@ static int proc_meminfo_read(char *buf, size_t size, off_t offset, + if (!cgroup_parse_memory_stat(cgroup, &mstat)) + return read_file_fuse("/proc/meminfo", buf, size, d); + +- memlimit = get_min_memlimit(cgroup, false); +- if (memlimit == 0) +- return 0; ++ ret = get_min_memlimit(cgroup, false, &memlimit); ++ if (ret < 0) ++ return read_file_fuse("/proc/meminfo", buf, size, d); + + /* + * Following values are allowed to fail, because swapaccount might be + * turned off for current kernel. 
+ */ + if (wants_swap) { +- memswlimit = get_min_memlimit(cgroup, true); +- if (memswlimit > 0) { ++ ret = get_min_memlimit(cgroup, true, &memswlimit); ++ if (ret >= 0) { + ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str); + if (ret >= 0 && safe_uint64(memswusage_str, &memswusage, 10) == 0) { + if (memlimit > memswlimit) +-- +2.33.0 + diff --git a/0031-proc-checks-system-security-policy-before-trying-to-.patch b/0031-proc-checks-system-security-policy-before-trying-to-.patch new file mode 100644 index 0000000000000000000000000000000000000000..6b3eace06f950896188e846530ffd3cfadd25950 --- /dev/null +++ b/0031-proc-checks-system-security-policy-before-trying-to-.patch @@ -0,0 +1,114 @@ +From c39b7a5934adaffe522b772048347ae82c797add Mon Sep 17 00:00:00 2001 +From: Samuel FORESTIER +Date: Sun, 28 Apr 2024 11:27:51 +0200 +Subject: [PATCH 25/27] proc: checks system security policy before trying to + get personalities + +096972f7 and fc8f593b introduces task personalities retrieval to fix +incorrect /proc files info in some cases. +Linux governs access to personalities based on system ptrace policy, +which may be restricted by an LSM (e.g. Yama). + +This patch implements a simple check for init's personality access to +make sure ptrace usage is allowed, and prevent access from containers to +proc files with "Permission denied" error if not. + +> closes #636 (follow-up to #553 and #609). 
+ +Signed-off-by: Samuel FORESTIER +conflict: src/proc_fuse.c +The number of lines does not match because lxcfs supports /proc/partitions +Signed-off-by: vegbir + +conflict: src/proc_fuse.c and src/utils.c +Signed-off-by: wujichao +--- + src/proc_fuse.c | 10 ++++++++-- + src/utils.c | 22 ++++++++++++++++++++++ + src/utils.h | 3 +++ + 3 files changed, 33 insertions(+), 2 deletions(-) + +diff --git a/src/proc_fuse.c b/src/proc_fuse.c +index 8beb64c..8fd5f32 100644 +--- a/src/proc_fuse.c ++++ b/src/proc_fuse.c +@@ -165,8 +165,11 @@ __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb) + strcmp(path, "/proc/swaps") == 0 || + strcmp(path, "/proc/loadavg") == 0 || + strcmp(path, "/proc/partitions") == 0) { +- if (liblxcfs_functional()) ++ if (liblxcfs_functional()) { ++ if (!can_access_personality()) ++ return log_error(-EACCES, RESTRICTED_PERSONALITY_ACCESS_POLICY); + sb->st_size = get_procfile_size_with_personality(path); ++ } + else + sb->st_size = get_procfile_size(path); + sb->st_mode = S_IFREG | 00444; +@@ -226,8 +229,11 @@ __lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi) + + info->type = type; + +- if (liblxcfs_functional()) ++ if (liblxcfs_functional()) { ++ if (!can_access_personality()) ++ return log_error(-EACCES, RESTRICTED_PERSONALITY_ACCESS_POLICY); + info->buflen = get_procfile_size_with_personality(path) + BUF_RESERVE_SIZE; ++ } + else + info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE; + +diff --git a/src/utils.c b/src/utils.c +index b826b2e..35af806 100644 +--- a/src/utils.c ++++ b/src/utils.c +@@ -575,3 +575,25 @@ char *trim_whitespace_in_place(char *buffer) + buffer[char_right_gc(buffer, strlen(buffer))] = '\0'; + return buffer; + } ++ ++/* ++ This function checks whether system security policy (i.e. Yama LSM) allows personality access, by trying on ++ init own one. 
++ This is required as it may be restricted by a ptrace access mode check (see PROC(5)), and ++ `get_task_personality` function relies on this. ++*/ ++bool can_access_personality(void) ++{ ++ static int could_access_init_personality = -1; ++ ++ /* init personality has never been accessed (cache is empty) */ ++ if (could_access_init_personality == -1) { ++ if (get_task_personality(1, NULL) < 0) { ++ could_access_init_personality = 0; ++ } else { ++ could_access_init_personality = 1; ++ } ++ } ++ ++ return could_access_init_personality != 0; ++} +\ No newline at end of file +diff --git a/src/utils.h b/src/utils.h +index ab2da0f..8dd4d06 100644 +--- a/src/utils.h ++++ b/src/utils.h +@@ -25,6 +25,8 @@ + #define SEND_CREDS_NOTSK 1 + #define SEND_CREDS_FAIL 2 + ++#define RESTRICTED_PERSONALITY_ACCESS_POLICY "Due to restricted personality access policy, reading proc files from containers is not permitted" ++ + struct file_info; + + __attribute__((__format__(__printf__, 4, 5))) extern char *must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...); +@@ -68,5 +70,6 @@ extern FILE *fdopen_cached(int fd, const char *mode, void **caller_freed_buffer) + extern ssize_t write_nointr(int fd, const void *buf, size_t count); + extern int safe_uint64(const char *numstr, uint64_t *converted, int base); + extern char *trim_whitespace_in_place(char *buffer); ++extern bool can_access_personality(void); + + #endif /* __LXCFS_UTILS_H */ +-- +2.33.0 + diff --git a/0032-utils-fix-wait_for_sock-to-use-time_t-instead-of-int.patch b/0032-utils-fix-wait_for_sock-to-use-time_t-instead-of-int.patch new file mode 100644 index 0000000000000000000000000000000000000000..eac652f1f44316358476b451b1fe85c7c6c45d4e --- /dev/null +++ b/0032-utils-fix-wait_for_sock-to-use-time_t-instead-of-int.patch @@ -0,0 +1,30 @@ +From 057deedf2081c75add5b81cec3267554a8bb564d Mon Sep 17 00:00:00 2001 +From: Alexander Mikhalitsyn +Date: Fri, 31 Jan 2025 10:29:36 +0100 +Subject: [PATCH 26/27] utils: fix 
wait_for_sock to use time_t instead of int + +Fixes: Coverity 382186 +Signed-off-by: Alexander Mikhalitsyn + +Signed-off-by: wujichao +--- + src/utils.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/src/utils.c b/src/utils.c +index 35af806..7a85511 100644 +--- a/src/utils.c ++++ b/src/utils.c +@@ -172,7 +172,8 @@ bool wait_for_sock(int sock, int timeout) + { + __do_close int epfd = -EBADF; + struct epoll_event ev; +- int ret, now, starttime, deltatime; ++ int ret; ++ time_t now, starttime, deltatime; + + if ((starttime = time(NULL)) < 0) + return false; +-- +2.33.0 + diff --git a/0033-cpuset_parse-make-a-check-for-an-empty-string-in-cpu.patch b/0033-cpuset_parse-make-a-check-for-an-empty-string-in-cpu.patch new file mode 100644 index 0000000000000000000000000000000000000000..b797074ca685e6dbf405a21380adb438f9ff4dad --- /dev/null +++ b/0033-cpuset_parse-make-a-check-for-an-empty-string-in-cpu.patch @@ -0,0 +1,31 @@ +From a8886fd35f0bfbe36e7acaf4276699d09a72fe31 Mon Sep 17 00:00:00 2001 +From: Alexander Mikhalitsyn +Date: Fri, 31 Jan 2025 10:44:22 +0100 +Subject: [PATCH 27/27] cpuset_parse: make a check for an empty string in + cpu_in_cpuset() + +Fixes: Coverity 382195 +Signed-off-by: Alexander Mikhalitsyn + +Signed-off-by: wujichao +--- + src/cpuset_parse.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/src/cpuset_parse.c b/src/cpuset_parse.c +index e511bb0..bb16ac5 100644 +--- a/src/cpuset_parse.c ++++ b/src/cpuset_parse.c +@@ -36,6 +36,9 @@ static int cpuset_getrange(const char *c, int *a, int *b) + */ + bool cpu_in_cpuset(int cpu, const char *cpuset) + { ++ if (!strlen(cpuset)) ++ return false; ++ + for (const char *c = cpuset; c; c = cpuset_nexttok(c)) { + int a, b, ret; + +-- +2.33.0 + diff --git a/lxcfs.spec b/lxcfs.spec index 57736745e650d76cee67b831670b49b81c4c2a33..2cc802cd733549862f88bb06c593334d76c9039d 100644 --- a/lxcfs.spec +++ b/lxcfs.spec @@ -4,7 +4,7 @@ #Basic Information Name: lxcfs Version: 4.0.11 -Release: 9 
+Release: 10 Summary: FUSE filesystem for LXC License: LGPL 2.1+ URL: http://linuxcontainers.org @@ -39,6 +39,15 @@ Patch9021: 0021-fix-pidfd_open-pidfd_send_signal-function-compilatio.patch Patch9022: 0022-cpuview-fix-possible-use-after-free-in-find_proc_sta.patch Patch9023: 0023-proc-fix-proc-diskstats-output-format.patch Patch9024: 0024-typofix-fix-incorrect-printing-in-lxcfs-help-interfa.patch +Patch9025: 0025-proc_loadavg-fix-ABBA-deadlock-between-read-refresh.patch +Patch9026: 0026-proc-Fix-proc-cpuinfo-not-respecting-personality.patch +Patch9027: 0027-proc_loadavg.c-Fix-incompatible-integer-to-pointer-c.patch +Patch9028: 0028-proc-fix-MemAvailable-in-proc-meminfo-to-exclude-tmp.patch +Patch9029: 0029-proc-Fix-swap-handling-for-cgroups-v2-can_use_swap.patch +Patch9030: 0030-proc-Fix-swap-handling-for-cgroups-v2-zero-limits.patch +Patch9031: 0031-proc-checks-system-security-policy-before-trying-to-.patch +Patch9032: 0032-utils-fix-wait_for_sock-to-use-time_t-instead-of-int.patch +Patch9033: 0033-cpuset_parse-make-a-check-for-an-empty-string-in-cpu.patch #Dependency BuildRequires: autoconf automake libtool help2man @@ -100,6 +109,12 @@ fi %{_unitdir}/* %changelog +* Fri Jul 18 2025 wujichao - 4.0.11-10 +- Type:bugfix +- CVE:NA +- SUG:NA +- DESC:backport patches from upstream + * Fri Dec 15 2023 yangjiaqi - 4.0.11-9 - Type:bugfix - CVE:NA