From a7ae35d65d62fa7ebc70dac2fb4984a972cd8021 Mon Sep 17 00:00:00 2001 From: zhangxiaoyu Date: Wed, 13 Sep 2023 16:20:52 +0800 Subject: [PATCH] fix run container failed when enable isulad Signed-off-by: zhangxiaoyu --- 0006-remove-isulad_cgfsng.patch | 4157 +++++++++++++++++ ...-container-failed-when-enable-isulad.patch | 1060 +++++ lxc.spec | 12 +- 3 files changed, 5227 insertions(+), 2 deletions(-) create mode 100644 0006-remove-isulad_cgfsng.patch create mode 100644 0007-fix-run-container-failed-when-enable-isulad.patch diff --git a/0006-remove-isulad_cgfsng.patch b/0006-remove-isulad_cgfsng.patch new file mode 100644 index 0000000..d726a17 --- /dev/null +++ b/0006-remove-isulad_cgfsng.patch @@ -0,0 +1,4157 @@ +From 8db83e2f87cc5377b7a0d3a895d05df37c4abba1 Mon Sep 17 00:00:00 2001 +From: zhangxiaoyu +Date: Wed, 18 Oct 2023 11:01:26 +0800 +Subject: [PATCH 1/2] remove isulad_cgfsng + +Signed-off-by: zhangxiaoyu +--- + src/lxc/cgroups/isulad_cgfsng.c | 4137 ------------------------------- + 1 file changed, 4137 deletions(-) + delete mode 100644 src/lxc/cgroups/isulad_cgfsng.c + +diff --git a/src/lxc/cgroups/isulad_cgfsng.c b/src/lxc/cgroups/isulad_cgfsng.c +deleted file mode 100644 +index 1160af5..0000000 +--- a/src/lxc/cgroups/isulad_cgfsng.c ++++ /dev/null +@@ -1,4137 +0,0 @@ +-/****************************************************************************** +- * Copyright (c) Huawei Technologies Co., Ltd. 2019. All rights reserved. +- * Author: lifeng +- * Create: 2020-11-02 +- * Description: provide container definition +- * lxc: linux Container library +- * This library is free software; you can redistribute it and/or +- * modify it under the terms of the GNU Lesser General Public +- * License as published by the Free Software Foundation; either +- * version 2.1 of the License, or (at your option) any later version. 
+- * +- * This library is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- * Lesser General Public License for more details. +- * +- * You should have received a copy of the GNU Lesser General Public +- * License along with this library; if not, write to the Free Software +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +- ******************************************************************************/ +- +-#ifndef _GNU_SOURCE +-#define _GNU_SOURCE 1 +-#endif +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include "af_unix.h" +-#include "caps.h" +-#include "cgroup.h" +-#include "cgroup2_devices.h" +-#include "cgroup_utils.h" +-#include "commands.h" +-#include "commands_utils.h" +-#include "conf.h" +-#include "config.h" +-#include "log.h" +-#include "macro.h" +-#include "mainloop.h" +-#include "memory_utils.h" +-#include "open_utils.h" +-#include "storage/storage.h" +-#include "utils.h" +- +-#if !HAVE_STRLCPY +-#include "include/strlcpy.h" +-#endif +- +-#if !HAVE_STRLCAT +-#include "include/strlcat.h" +-#endif +- +-#if HAVE_LIBSYSTEMD +-#include +-#include +-#endif +- +-lxc_log_define(isulad_cgfsng, cgroup); +- +-/* +- * Given a pointer to a null-terminated array of pointers, realloc to add one +- * entry, and point the new entry to NULL. Do not fail. Return the index to the +- * second-to-last entry - that is, the one which is now available for use +- * (keeping the list null-terminated). 
+- */ +-static int cg_list_add(void ***list) +-{ +- int idx = 0; +- void **p; +- +- if (*list) +- for (; (*list)[idx]; idx++) +- ; +- +- p = realloc(*list, (idx + 2) * sizeof(void **)); +- if (!p) +- return ret_errno(ENOMEM); +- +- p[idx + 1] = NULL; +- *list = p; +- +- return idx; +-} +- +-/* Given a null-terminated array of strings, check whether @entry is one of the +- * strings. +- */ +-static bool string_in_list(char **list, const char *entry) +-{ +- if (!list) +- return false; +- +- for (int i = 0; list[i]; i++) +- if (strcmp(list[i], entry) == 0) +- return true; +- +- return false; +-} +- +-/* Given a handler's cgroup data, return the struct hierarchy for the controller +- * @c, or NULL if there is none. +- */ +-static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller) +-{ +- if (!ops->hierarchies) +- return log_trace_errno(NULL, errno, "There are no useable cgroup controllers"); +- +- for (int i = 0; ops->hierarchies[i]; i++) { +- if (!controller) { +- /* This is the empty unified hierarchy. */ +- if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0]) +- return ops->hierarchies[i]; +- +- continue; +- } +- +- /* +- * Handle controllers with significant implementation changes +- * from cgroup to cgroup2. 
+- */ +- if (pure_unified_layout(ops)) { +- if (strequal(controller, "devices")) { +- if (device_utility_controller(ops->unified)) +- return ops->unified; +- +- break; +- } else if (strequal(controller, "freezer")) { +- if (freezer_utility_controller(ops->unified)) +- return ops->unified; +- +- break; +- } +- } +- +- if (string_in_list(ops->hierarchies[i]->controllers, controller)) +- return ops->hierarchies[i]; +- } +- +- if (controller) +- WARN("There is no useable %s controller", controller); +- else +- WARN("There is no empty unified cgroup hierarchy"); +- +- return ret_set_errno(NULL, ENOENT); +-} +- +-int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit) +-{ +- int dfd; +- const struct hierarchy *h; +- +- h = get_hierarchy(ops, fd->controller); +- if (!h) +- return ret_errno(ENOENT); +- +- /* +- * The client requested that the controller must be in a specific +- * cgroup version. +- */ +- if (fd->type != 0 && (cgroupfs_type_magic_t)fd->type != h->fs_type) +- return ret_errno(EINVAL); +- +- if (limit) +- dfd = h->dfd_con; +- else +- dfd = h->dfd_lim; +- if (dfd < 0) +- return ret_errno(EBADF); +- +- fd->layout = ops->cgroup_layout; +- fd->type = h->fs_type; +- if (fd->type == UNIFIED_HIERARCHY) +- fd->utilities = h->utilities; +- fd->fd = dfd; +- +- return 0; +-} +- +-#define BATCH_SIZE 50 +-static void batch_realloc(char **mem, size_t oldlen, size_t newlen) +-{ +- int newbatches = (newlen / BATCH_SIZE) + 1; +- int oldbatches = (oldlen / BATCH_SIZE) + 1; +- +- if (!*mem || newbatches > oldbatches) +- *mem = must_realloc(*mem, newbatches * BATCH_SIZE); +-} +- +-static void append_line(char **dest, size_t oldlen, char *new, size_t newlen) +-{ +- size_t full = oldlen + newlen; +- +- batch_realloc(dest, oldlen, full + 1); +- +- memcpy(*dest + oldlen, new, newlen + 1); +-} +- +-/* Slurp in a whole file */ +-static char *read_file(const char *fnam) +-{ +- __do_free char *buf = NULL, *line = NULL; +- __do_fclose FILE *f = NULL; +- 
size_t len = 0, fulllen = 0; +- int linelen; +- +- f = fopen(fnam, "re"); +- if (!f) +- return NULL; +- +- while ((linelen = getline(&line, &len, f)) != -1) { +- append_line(&buf, fulllen, line, linelen); +- fulllen += linelen; +- } +- +- return move_ptr(buf); +-} +- +-static inline bool is_unified_hierarchy(const struct hierarchy *h) +-{ +- return h->fs_type == UNIFIED_HIERARCHY; +-} +- +-static char *trim(char *s) +-{ +- size_t len; +- +- len = strlen(s); +- while ((len > 1) && (s[len - 1] == '\n')) +- s[--len] = '\0'; +- +- return s; +-} +- +-/* Return true if the controller @entry is found in the null-terminated list of +- * hierarchies @hlist. +- */ +-static bool controller_available(struct hierarchy **hlist, char *entry) +-{ +- if (!hlist) +- return false; +- +- for (int i = 0; hlist[i]; i++) +- if (string_in_list(hlist[i]->controllers, entry)) +- return true; +- +- return false; +-} +- +-static bool controllers_available(struct cgroup_ops *ops) +-{ +- struct hierarchy **hlist; +- +- if (!ops->cgroup_use) +- return true; +- +- hlist = ops->hierarchies; +- for (char **cur = ops->cgroup_use; cur && *cur; cur++) +- if (!controller_available(hlist, *cur)) +- return log_error(false, "The %s controller found", *cur); +- +- return true; +-} +- +-static char **list_new(void) +-{ +- __do_free_string_list char **list = NULL; +- int idx; +- +- idx = cg_list_add((void ***)&list); +- if (idx < 0) +- return NULL; +- +- list[idx] = NULL; +- return move_ptr(list); +-} +- +-static int list_add_string(char ***list, char *entry) +-{ +- __do_free char *dup = NULL; +- int idx; +- +- dup = strdup(entry); +- if (!dup) +- return ret_errno(ENOMEM); +- +- idx = cg_list_add((void ***)list); +- if (idx < 0) +- return idx; +- +- (*list)[idx] = move_ptr(dup); +- return 0; +-} +- +-static char **list_add_controllers(char *controllers) +-{ +- __do_free_string_list char **list = NULL; +- char *it; +- +- lxc_iterate_parts(it, controllers, ", \t\n") { +- int ret; +- +- ret = 
list_add_string(&list, it); +- if (ret < 0) +- return NULL; +- } +- +- return move_ptr(list); +-} +- +-static char **unified_controllers(int dfd, const char *file) +-{ +- __do_free char *buf = NULL; +- +- buf = read_file_at(dfd, file, PROTECT_OPEN, 0); +- if (!buf) +- return NULL; +- +- return list_add_controllers(buf); +-} +- +-static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers) +-{ +- if (!ops->cgroup_use) +- return false; +- +- for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) { +- bool found = false; +- +- for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) { +- if (!strequal(*cur_use, *cur_ctrl)) +- continue; +- +- found = true; +- break; +- } +- +- if (found) +- continue; +- +- return true; +- } +- +- return false; +-} +- +-static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt, +- int dfd_base, char *base_cgroup, +- char **controllers, cgroupfs_type_magic_t fs_type) +-{ +- __do_free struct hierarchy *new = NULL; +- int idx; +- +- if (abspath(base_cgroup)) +- return syserror_set(-EINVAL, "Container base path must be relative to controller mount"); +- +- new = zalloc(sizeof(*new)); +- if (!new) +- return ret_errno(ENOMEM); +- +- new->dfd_con = -EBADF; +- new->dfd_lim = -EBADF; +- new->dfd_mon = -EBADF; +- +- new->fs_type = fs_type; +- new->controllers = controllers; +- new->at_mnt = mnt; +- new->at_base = base_cgroup; +- +- new->dfd_mnt = dfd_mnt; +- new->dfd_base = dfd_base; +- +- TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s", +- mnt, maybe_empty(base_cgroup)); +- for (char *const *it = new->controllers; it && *it; it++) +- TRACE("The hierarchy contains the %s controller", *it); +- +- idx = cg_list_add((void ***)&ops->hierarchies); +- if (idx < 0) +- return ret_errno(idx); +- +- if (fs_type == UNIFIED_HIERARCHY) +- ops->unified = new; +- (ops->hierarchies)[idx] = move_ptr(new); +- +- return 0; +-} +- +-struct generic_userns_exec_data { +- struct 
hierarchy **hierarchies; +- const char *path_prune; +- struct lxc_conf *conf; +- uid_t origuid; /* target uid in parent namespace */ +- char *path; +-}; +- +-static int isulad_cgroup_tree_remove(struct hierarchy **hierarchies, +- const char *container_cgroup) +-{ +- if (!container_cgroup || !hierarchies) +- return 0; +- +- for (int i = 0; hierarchies[i]; i++) { +- struct hierarchy *h = hierarchies[i]; +- int ret; +- +- if (!h->path_con) { +- h->path_con = must_make_path(h->at_mnt, h->at_base, container_cgroup, NULL); +- } +- +- ret = lxc_rm_rf(h->path_con); +- if (ret < 0) { +- if (errno == ENOENT) { +- WARN("Destroy path: \"%s\" do not exist", h->path_con); +- return 0; +- } +- SYSERROR("Failed to destroy \"%s\"", h->path_con); +- return -1; +- } +- +- free_disarm(h->path_con); +- } +- +- return 0; +-} +- +-static int isulad_cgroup_tree_remove_wrapper(void *data) +-{ +- struct generic_userns_exec_data *arg = data; +- uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid; +- gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 
0 : arg->conf->init_gid; +- int ret; +- +- if (!lxc_drop_groups() && errno != EPERM) +- return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)"); +- +- ret = setresgid(nsgid, nsgid, nsgid); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)", +- (int)nsgid, (int)nsgid, (int)nsgid); +- +- ret = setresuid(nsuid, nsuid, nsuid); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)", +- (int)nsuid, (int)nsuid, (int)nsuid); +- +- return isulad_cgroup_tree_remove(arg->hierarchies, arg->path_prune); +-} +- +-__cgfsng_ops static bool isulad_cgfsng_payload_destroy(struct cgroup_ops *ops, +- struct lxc_handler *handler) +-{ +- int ret; +- +- if (!ops) { +- ERROR("Called with uninitialized cgroup operations"); +- return false; +- } +- +- if (ops->no_controller) { +- DEBUG("no controller found, ignore isulad_cgfsng_payload_destroy"); +- return true; +- } +- +- if (!ops->hierarchies) { +- DEBUG("no hierarchies found, ignore isulad_cgfsng_payload_destroy"); +- return true; +- } +- +- if (!handler) { +- ERROR("Called with uninitialized handler"); +- return false; +- } +- +- if (!handler->conf) { +- ERROR("Called with uninitialized conf"); +- return false; +- } +- +-#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX +- ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices); +- if (ret < 0) +- WARN("Failed to detach bpf program from cgroup"); +-#endif +- +- if (!list_empty(&handler->conf->id_map) && !handler->am_root) { +- struct generic_userns_exec_data wrap = { +- .conf = handler->conf, +- .path_prune = ops->container_limit_cgroup, +- .hierarchies = ops->hierarchies, +- .origuid = 0, +- }; +- ret = userns_exec_1(handler->conf, isulad_cgroup_tree_remove_wrapper, +- &wrap, "cgroup_tree_remove_wrapper"); +- } else { +- ret = isulad_cgroup_tree_remove(ops->hierarchies, ops->container_cgroup); +- } +- if (ret < 0) { +- SYSWARN("Failed to destroy cgroups"); +- return false; +- } +- +- return true; +-} +- 
+-__cgfsng_ops static void isulad_cgfsng_monitor_destroy(struct cgroup_ops *ops, +- struct lxc_handler *handler) +-{ +- return; +-} +- +-#define SYSTEMD_SCOPE_FAILED 2 +-#define SYSTEMD_SCOPE_UNSUPP 1 +-#define SYSTEMD_SCOPE_SUCCESS 0 +- +-#if HAVE_LIBSYSTEMD +-struct sd_callback_data { +- char *scope_name; +- bool job_complete; +-}; +- +-static int systemd_jobremoved_callback(sd_bus_message *m, void *userdata, sd_bus_error *error) +-{ +- char *path, *unit, *result; +- struct sd_callback_data *sd_data = userdata; +- uint32_t id; +- int r; +- +- r = sd_bus_message_read(m, "uoss", &id, &path, &unit, &result); +- if (r < 0) +- return log_error(-1, "bad message received in callback: %s", strerror(-r)); +- +- if (sd_data->scope_name && strcmp(unit, sd_data->scope_name) != 0) +- return log_trace(-1, "unit was '%s' not '%s'", unit, sd_data->scope_name); +- if (strcmp(result, "done") == 0) { +- sd_data->job_complete = true; +- return log_info(1, "job is done"); +- } +- return log_debug(0, "result was '%s', not 'done'", result); +-} +- +-#define DESTINATION "org.freedesktop.systemd1" +-#define PATH "/org/freedesktop/systemd1" +-#define INTERFACE "org.freedesktop.systemd1.Manager" +-#define MEMBER "StartTransientUnit" +-static bool start_scope(sd_bus *bus, struct sd_callback_data *data, struct sd_event *event) +-{ +- __attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;; +- __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL; +- __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL; +- char *path = NULL; +- int r; +- +- r = sd_bus_message_new_method_call(bus, &m, +- DESTINATION, PATH, INTERFACE, MEMBER); +- if (r < 0) +- return log_error(false, "Failed creating sdbus message"); +- +- r = sd_bus_message_append(m, "ss", data->scope_name, "fail"); +- if (r < 0) +- return log_error(false, "Failed setting systemd scope name"); +- +- r = sd_bus_message_open_container(m, 'a', "(sv)"); +- if (r < 
0) +- return log_error(false, "Failed allocating sdbus msg properties"); +- +- r = sd_bus_message_append(m, "(sv)(sv)(sv)", +- "PIDs", "au", 1, getpid(), +- "Delegate", "b", 1, +- "CollectMode", "s", "inactive-or-failed"); +- if (r < 0) +- return log_error(false, "Failed setting properties on sdbus message"); +- +- r = sd_bus_message_close_container(m); +- if (r < 0) +- return log_error(false, "Failed closing sdbus message properties"); +- +- r = sd_bus_message_append(m, "a(sa(sv))", 0); +- if (r < 0) +- return log_error(false, "Failed appending aux boilerplate\n"); +- +- r = sd_bus_call(NULL, m, 0, &error, &reply); +- if (r < 0) +- return log_error(false, "Failed sending sdbus message: %s", error.message); +- +- /* Parse the response message */ +- r = sd_bus_message_read(reply, "o", &path); +- if (r < 0) +- return log_error(false, "Failed to parse response message: %s", strerror(-r)); +- +- /* Now spin up a mini-event-loop to wait for the "job completed" message */ +- int tries = 0; +- +- while (!data->job_complete) { +- r = sd_event_run(event, 1000 * 1000); +- if (r < 0) { +- log_debug(stderr, "Error waiting for JobRemoved: %s\n", strerror(-r)); +- continue; +- } +- if (data->job_complete || tries == 5) +- break; +- if (r > 0) { +- log_trace(stderr, "Debug: we processed an event (%d), but not the one we wanted\n", r); +- continue; +- } +- if (r == 0) // timeout +- tries++; +- } +- if (!data->job_complete) { +- return log_error(false, "Error: %s job was never removed", data->scope_name); +- } +- return true; +-} +- +-static bool string_pure_unified_system(char *contents) +-{ +- char *p; +- bool first_line_read = false; +- +- lxc_iterate_parts(p, contents, "\n") { +- if (first_line_read) // if >1 line, this is not pure unified +- return false; +- first_line_read = true; +- +- if (strlen(p) > 3 && strncmp(p, "0:", 2) == 0) +- return true; +- } +- +- return false; +-} +- +-/* +- * Only call get_current_unified_cgroup() when we are in a pure +- * unified (v2-only) 
cgroup +- */ +-static char *get_current_unified_cgroup(void) +-{ +- __do_free char *buf = NULL; +- __do_free_string_list char **list = NULL; +- char *p; +- +- buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); +- if (!buf) +- return NULL; +- +- if (!string_pure_unified_system(buf)) +- return NULL; +- +- // 0::/user.slice/user-1000.slice/session-136.scope +- // Get past the "0::" +- p = buf; +- if (strnequal(p, "0::", STRLITERALLEN("0::"))) +- p += STRLITERALLEN("0::"); +- +- return strdup(p); +-} +- +-static bool pure_unified_system(void) +-{ +- __do_free char *buf = NULL; +- +- buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); +- if (!buf) +- return false; +- +- return string_pure_unified_system(buf); +-} +- +-#define MEMBER_JOIN "AttachProcessesToUnit" +-static bool enter_scope(char *scope_name, pid_t pid) +-{ +- __attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL; +- __attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;; +- __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL; +- __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL; +- int r; +- +- r = sd_bus_open_user(&bus); +- if (r < 0) +- return log_error(false, "Failed to connect to user bus: %s", strerror(-r)); +- +- r = sd_bus_message_new_method_call(bus, &m, +- DESTINATION, PATH, INTERFACE, MEMBER_JOIN); +- if (r < 0) +- return log_error(false, "Failed creating sdbus message"); +- +- r = sd_bus_message_append(m, "ssau", scope_name, "/init", 1, pid); +- if (r < 0) +- return log_error(false, "Failed setting systemd scope name"); +- +- +- r = sd_bus_call(NULL, m, 0, &error, &reply); +- if (r < 0) +- return log_error(false, "Failed sending sdbus message: %s", error.message); +- +- return true; +-} +- +-static bool enable_controllers_delegation(int fd_dir, char *cg) +-{ +- __do_free char *rbuf = NULL; +- __do_free char *wbuf = NULL; +- __do_free_string_list char **cpulist = NULL; +- 
char *controller; +- size_t full_len = 0; +- bool first = true; +- int ret; +- +- rbuf = read_file_at(fd_dir, "cgroup.controllers", PROTECT_OPEN, 0); +- if (!rbuf) +- return false; +- +- lxc_iterate_parts(controller, rbuf, " ") { +- full_len += strlen(controller) + 2; +- wbuf = must_realloc(wbuf, full_len + 1); +- if (first) { +- wbuf[0] = '\0'; +- first = false; +- } else { +- (void)strlcat(wbuf, " ", full_len + 1); +- } +- strlcat(wbuf, "+", full_len + 1); +- strlcat(wbuf, controller, full_len + 1); +- } +- if (!wbuf) +- return log_debug(true, "No controllers to delegate!"); +- +- ret = lxc_writeat(fd_dir, "cgroup.subtree_control", wbuf, strlen(wbuf)); +- if (ret < 0) +- return log_error_errno(false, errno, "Failed to write \"%s\" to %s/cgroup.subtree_control", wbuf, cg); +- +- return true; +-} +- +-/* +- * systemd places us in say .../lxc-1.scope. We create lxc-1.scope/init, +- * move ourselves to there, then enable controllers in lxc-1.scope +- */ +-static bool move_and_delegate_unified(char *parent_cgroup) +-{ +- __do_free char *buf = NULL; +- __do_close int fd_parent = -EBADF; +- int ret; +- +- fd_parent = open_at(-EBADF, parent_cgroup, O_DIRECTORY, 0, 0); +- if (fd_parent < 0) +- return syserror_ret(false, "Failed opening cgroup dir \"%s\"", parent_cgroup); +- +- ret = mkdirat(fd_parent, "init", 0755); +- if (ret < 0 && errno != EEXIST) +- return syserror_ret(false, "Failed to create \"%d/init\" cgroup", fd_parent); +- +- buf = read_file_at(fd_parent, "cgroup.procs", PROTECT_OPEN, 0); +- if (!buf) +- return false; +- +- ret = lxc_writeat(fd_parent, "init/cgroup.procs", buf, strlen(buf)); +- if (ret) +- return syserror_ret(false, "Failed to escape to cgroup \"init/cgroup.procs\""); +- +- /* enable controllers in parent_cgroup */ +- return enable_controllers_delegation(fd_parent, parent_cgroup); +-} +- +-static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf) +-{ +- __do_free char *full_scope_name = NULL; +- __do_free char 
*fs_cg_path = NULL; +- sd_event *event = NULL; +- __attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL; // free the bus before the names it references, just to be sure +- struct sd_callback_data sd_data; +- int idx = 0; +- size_t len; +- int r; +- +- if (geteuid() == 0) +- return log_info(SYSTEMD_SCOPE_UNSUPP, "Running privileged, not using a systemd unit"); +- // Pure_unified_layout() can't be used as that info is not yet setup. At +- // the same time, we don't want to calculate current cgroups until after +- // we optionally enter a new systemd user scope. So let's just do a quick +- // check for pure unified cgroup system: single line /proc/self/cgroup with +- // only index '0:' +- if (!pure_unified_system()) +- return log_info(SYSTEMD_SCOPE_UNSUPP, "Not in unified layout, not using a systemd unit"); +- +- r = sd_bus_open_user(&bus); +- if (r < 0) +- return log_error(SYSTEMD_SCOPE_FAILED, "Failed to connect to user bus: %s", strerror(-r)); +- +- r = sd_bus_call_method_async(bus, NULL, DESTINATION, PATH, INTERFACE, "Subscribe", NULL, NULL, NULL); +- if (r < 0) +- return log_error(SYSTEMD_SCOPE_FAILED, "Failed to subscribe to signals: %s", strerror(-r)); +- +- sd_data.job_complete = false; +- sd_data.scope_name = NULL; +- r = sd_bus_match_signal(bus, +- NULL, // no slot +- DESTINATION, PATH, INTERFACE, "JobRemoved", +- systemd_jobremoved_callback, &sd_data); +- if (r < 0) +- return log_error(SYSTEMD_SCOPE_FAILED, "Failed to register systemd event loop signal handler: %s", strerror(-r)); +- +- // NEXT: create and attach event +- r = sd_event_new(&event); +- if (r < 0) +- return log_error(SYSTEMD_SCOPE_FAILED, "Failed allocating new event: %s\n", strerror(-r)); +- r = sd_bus_attach_event(bus, event, SD_EVENT_PRIORITY_NORMAL); +- if (r < 0) { +- // bus won't clean up event since the attach failed +- sd_event_unrefp(&event); +- return log_error(SYSTEMD_SCOPE_FAILED, "Failed attaching event: %s\n", strerror(-r)); +- } +- +- // "lxc-" + (conf->name) + "-NN" + 
".scope" + '\0' +- len = STRLITERALLEN("lxc-") + strlen(conf->name) + 3 + STRLITERALLEN(".scope") + 1; +- full_scope_name = malloc(len); +- if (!full_scope_name) +- return syserror("Out of memory"); +- +- do { +- r = strnprintf(full_scope_name, len, "lxc-%s-%d.scope", conf->name, idx); +- if (r < 0) +- return log_error_errno(-1, errno, "Failed to build scope name for \"%s\"", conf->name); +- sd_data.scope_name = full_scope_name; +- if (start_scope(bus, &sd_data, event)) { +- conf->cgroup_meta.systemd_scope = get_current_unified_cgroup(); +- if (!conf->cgroup_meta.systemd_scope) +- return log_trace(SYSTEMD_SCOPE_FAILED, "Out of memory"); +- fs_cg_path = must_make_path("/sys/fs/cgroup", conf->cgroup_meta.systemd_scope, NULL); +- if (!move_and_delegate_unified(fs_cg_path)) +- return log_error(SYSTEMD_SCOPE_FAILED, "Failed delegating the controllers to our cgroup"); +- return log_trace(SYSTEMD_SCOPE_SUCCESS, "Created systemd scope %s", full_scope_name); +- } +- idx++; +- } while (idx < 99); +- +- return SYSTEMD_SCOPE_FAILED; // failed, let's try old-school after all +-} +-#else /* !HAVE_LIBSYSTEMD */ +-static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf) +-{ +- TRACE("unpriv_systemd_create_scope: no systemd support"); +- return SYSTEMD_SCOPE_UNSUPP; // not supported +-} +-#endif /* HAVE_LIBSYSTEMD */ +- +-// Return a duplicate of cgroup path @cg without leading /, so +-// that caller can own+free it and be certain it's not abspath. 
+-static char *cgroup_relpath(char *cg) +-{ +- char *p; +- +- if (!cg || strequal(cg, "/")) +- return NULL; +- p = strdup(deabs(cg)); +- if (!p) +- return ERR_PTR(-ENOMEM); +- +- return p; +-} +- +-__cgfsng_ops static inline bool isulad_cgfsng_monitor_create(struct cgroup_ops *ops, +- struct lxc_handler *handler) +-{ +- return true; +-} +- +-static bool isulad_copy_parent_file(char *path, char *file) +-{ +- int ret; +- int len = 0; +- char *value = NULL; +- char *current = NULL; +- char *fpath = NULL; +- char *lastslash = NULL; +- char oldv; +- +- fpath = must_make_path(path, file, NULL); +- current = read_file(fpath); +- +- if (current == NULL) { +- SYSERROR("Failed to read file \"%s\"", fpath); +- free(fpath); +- return false; +- } +- +- if (strcmp(current, "\n") != 0) { +- free(fpath); +- free(current); +- return true; +- } +- +- free(fpath); +- free(current); +- +- lastslash = strrchr(path, '/'); +- if (lastslash == NULL) { +- ERROR("Failed to detect \"/\" in \"%s\"", path); +- return false; +- } +- oldv = *lastslash; +- *lastslash = '\0'; +- fpath = must_make_path(path, file, NULL); +- *lastslash = oldv; +- len = lxc_read_from_file(fpath, NULL, 0); +- if (len <= 0) +- goto on_error; +- +- value = must_realloc(NULL, len + 1); +- ret = lxc_read_from_file(fpath, value, len); +- if (ret != len) +- goto on_error; +- free(fpath); +- +- fpath = must_make_path(path, file, NULL); +- ret = lxc_write_to_file(fpath, value, len, false, 0666); +- if (ret < 0) +- SYSERROR("Failed to write \"%s\" to file \"%s\"", value, fpath); +- free(fpath); +- free(value); +- return ret >= 0; +- +-on_error: +- SYSERROR("Failed to read file \"%s\"", fpath); +- free(fpath); +- free(value); +- return false; +-} +- +-static bool build_sub_cpuset_cgroup_dir(char *cgpath) +-{ +- int ret; +- +- ret = mkdir_p(cgpath, 0755); +- if (ret < 0) { +- if (errno != EEXIST) { +- SYSERROR("Failed to create directory \"%s\"", cgpath); +- return false; +- } +- } +- +- /* copy parent's settings */ +- if 
(!isulad_copy_parent_file(cgpath, "cpuset.cpus")) { +- SYSERROR("Failed to copy \"cpuset.cpus\" settings"); +- return false; +- } +- +- /* copy parent's settings */ +- if (!isulad_copy_parent_file(cgpath, "cpuset.mems")) { +- SYSERROR("Failed to copy \"cpuset.mems\" settings"); +- return false; +- } +- +- return true; +-} +- +-static bool isulad_cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname) +-{ +- char *cgpath, *slash; +- bool sub_mk_success = false; +- +- if (is_unified_hierarchy(h)) +- return true; +- +- if (!string_in_list(h->controllers, "cpuset")) +- return true; +- +- cgname += strspn(cgname, "/"); +- +- slash = strchr(cgname, '/'); +- +- if (slash != NULL) { +- while (slash) { +- *slash = '\0'; +- cgpath = must_make_path(h->at_mnt, h->at_base, cgname, NULL); +- sub_mk_success = build_sub_cpuset_cgroup_dir(cgpath); +- free(cgpath); +- *slash = '/'; +- if (!sub_mk_success) { +- return false; +- } +- slash = strchr(slash + 1, '/'); +- } +- } +- +- cgpath = must_make_path(h->at_mnt, h->at_base, cgname, NULL); +- sub_mk_success = build_sub_cpuset_cgroup_dir(cgpath); +- free(cgpath); +- if (!sub_mk_success) { +- return false; +- } +- +- return true; +-} +- +-static int isulad_mkdir_eexist_on_last(const char *dir, mode_t mode) +-{ +- const char *tmp = dir; +- const char *orig = dir; +- +- do { +- int ret; +- size_t cur_len; +- char *makeme; +- +- dir = tmp + strspn(tmp, "/"); +- tmp = dir + strcspn(dir, "/"); +- +- errno = ENOMEM; +- cur_len = dir - orig; +- makeme = strndup(orig, cur_len); +- if (!makeme) +- return -1; +- +- ret = mkdir(makeme, mode); +- if (ret < 0) { +- if (errno != EEXIST) { +- SYSERROR("Failed to create directory \"%s\"", makeme); +- free(makeme); +- return -1; +- } +- } +- free(makeme); +- +- } while (tmp != dir); +- +- return 0; +-} +- +-static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname, int errfd) +-{ +- int ret; +- __do_free char *path = NULL; +- +- path = must_make_path(h->at_mnt, h->at_base, 
cgname, NULL); +- +- if (file_exists(path)) { // it must not already exist +- ERROR("Cgroup path \"%s\" already exist.", path); +- lxc_write_error_message(errfd, "%s:%d: Cgroup path \"%s\" already exist.", +- __FILE__, __LINE__, path); +- return false; +- } +- +- if (!isulad_cg_legacy_handle_cpuset_hierarchy(h, cgname)) { +- ERROR("Failed to handle legacy cpuset controller"); +- return false; +- } +- +- ret = isulad_mkdir_eexist_on_last(path, 0755); +- if (ret < 0) { +- ERROR("Failed to create cgroup \"%s\"", path); +- return false; +- } +- +- h->dfd_con = lxc_open_dirfd(path); +- if (h->dfd_con < 0) +- return log_error_errno(false, errno, "Failed to open %s", path); +- +- if (h->path_con == NULL) { +- h->path_con = move_ptr(path); +- } +- +- return true; +-} +- +-/* isulad: create hierarchies path, if fail, return the error */ +-__cgfsng_ops static inline bool isulad_cgfsng_payload_create(struct cgroup_ops *ops, +- struct lxc_handler *handler) +-{ +- int i; +- +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +- char *container_cgroup = ops->container_cgroup; +- +- if (!ops->hierarchies) +- return true; +- +-#ifdef HAVE_ISULAD +- if (ops->no_controller) { +- DEBUG("no controller found, isgnore isulad_cgfsng_payload_create"); +- return true; +- } +-#endif +- +- if (!container_cgroup) { +- ERROR("cgfsng_create container_cgroup is invalid"); +- return false; +- } +- +- for (i = 0; ops->hierarchies[i]; i++) { +- if (!create_path_for_hierarchy(ops->hierarchies[i], container_cgroup, ops->errfd)) { +- SYSERROR("Failed to create %s", ops->hierarchies[i]->path_con); +- return false; +- } +- } +- +- return true; +-} +- +-__cgfsng_ops static bool isulad_cgfsng_monitor_enter(struct cgroup_ops *ops, +- struct lxc_handler *handler) +-{ +- return true; +-} +- +-__cgfsng_ops static bool isulad_cgfsng_payload_enter(struct cgroup_ops *ops, +- struct lxc_handler *handler) +-{ +- int len; +- char pidstr[INTTYPE_TO_STRLEN(pid_t)]; +- +- if (!ops) +- return ret_set_errno(false, 
ENOENT); +- +-#ifdef HAVE_ISULAD +- if (ops->no_controller) { +- DEBUG("no controller found, isgnore isulad_cgfsng_payload_enter"); +- return true; +- } +-#endif +- +- if (!ops->hierarchies) +- return true; +- +- if (!ops->container_cgroup) +- return ret_set_errno(false, ENOENT); +- +- if (!handler || !handler->conf) +- return ret_set_errno(false, EINVAL); +- +- len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid); +- +- for (int i = 0; ops->hierarchies[i]; i++) { +- int ret; +- char *fullpath; +- int retry_count = 0; +- int max_retry = 10; +- +- fullpath = must_make_path(ops->hierarchies[i]->path_con, +- "cgroup.procs", NULL); +-retry: +- ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666); +- if (ret != 0) { +- if (retry_count < max_retry) { +- SYSERROR("Failed to enter cgroup \"%s\" with retry count:%d", fullpath, retry_count); +- (void)isulad_cg_legacy_handle_cpuset_hierarchy(ops->hierarchies[i], ops->container_cgroup); +- (void)isulad_mkdir_eexist_on_last(ops->hierarchies[i]->path_con, 0755); +- usleep(100 * 1000); /* 100 millisecond */ +- retry_count++; +- goto retry; +- } +- SYSERROR("Failed to enter cgroup \"%s\"", fullpath); +- free(fullpath); +- return false; +- } +- free(fullpath); +- } +- +- return true; +-} +- +-static int fchowmodat(int dirfd, const char *path, uid_t chown_uid, +- gid_t chown_gid, mode_t chmod_mode) +-{ +- int ret; +- +- ret = fchownat(dirfd, path, chown_uid, chown_gid, +- AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); +- if (ret < 0) +- return log_warn_errno(-1, +- errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )", +- dirfd, path, (int)chown_uid, +- (int)chown_gid); +- +- ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0); +- if (ret < 0) +- return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)", +- dirfd, path, (int)chmod_mode); +- +- return 0; +-} +- +-/* chgrp the container cgroups to container group. 
We leave +- * the container owner as cgroup owner. So we must make the +- * directories 775 so that the container can create sub-cgroups. +- * +- * Also chown the tasks and cgroup.procs files. Those may not +- * exist depending on kernel version. +- */ +-static int chown_cgroup_wrapper(void *data) +-{ +- int ret; +- uid_t destuid; +- struct generic_userns_exec_data *arg = data; +- uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid; +- gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid; +- +- if (!lxc_drop_groups() && errno != EPERM) +- return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)"); +- +- ret = setresgid(nsgid, nsgid, nsgid); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)", +- (int)nsgid, (int)nsgid, (int)nsgid); +- +- ret = setresuid(nsuid, nsuid, nsuid); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)", +- (int)nsuid, (int)nsuid, (int)nsuid); +- +- destuid = get_ns_uid(arg->origuid); +- if (destuid == LXC_INVALID_UID) +- destuid = 0; +- +- for (int i = 0; arg->hierarchies[i]; i++) { +- int dirfd = arg->hierarchies[i]->dfd_con; +- +- if (dirfd < 0) +- return syserror_set(-EBADF, "Invalid cgroup file descriptor"); +- +- (void)fchowmodat(dirfd, "", destuid, nsgid, 0775); +- +- /* +- * Failures to chown() these are inconvenient but not +- * detrimental We leave these owned by the container launcher, +- * so that container root can write to the files to attach. We +- * chmod() them 664 so that container systemd can write to the +- * files (which systemd in wily insists on doing). 
+- */ +- +- if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY) +- (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664); +- +- (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664); +- +- if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY) +- continue; +- +- for (char **p = arg->hierarchies[i]->delegate; p && *p; p++) +- (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664); +- } +- +- return 0; +-} +- +-__cgfsng_ops static bool isulad_cgfsng_chown(struct cgroup_ops *ops, +- struct lxc_conf *conf) +-{ +- struct generic_userns_exec_data wrap; +- +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +- if (!ops->hierarchies) +- return true; +- +- if (!ops->container_cgroup) +- return ret_set_errno(false, ENOENT); +- +- if (!conf) +- return ret_set_errno(false, EINVAL); +- +- if (list_empty(&conf->id_map)) +- return true; +- +- wrap.origuid = geteuid(); +- wrap.path = NULL; +- wrap.hierarchies = ops->hierarchies; +- wrap.conf = conf; +- +- if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0) +- return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace"); +- +- return true; +-} +- +-__cgfsng_ops static void isulad_cgfsng_finalize(struct cgroup_ops *ops) +-{ +- if (!ops) +- return; +- +-#ifdef HAVE_ISULAD +- if (ops->no_controller) { +- DEBUG("no controller found, isgnore isulad_cgfsng_payload_finalize"); +- return; +- } +-#endif +- +- if (!ops->hierarchies) +- return; +- +- for (int i = 0; ops->hierarchies[i]; i++) { +- struct hierarchy *h = ops->hierarchies[i]; +- +- /* Close all monitor cgroup file descriptors. */ +- close_prot_errno_disarm(h->dfd_mon); +- } +- /* Close the cgroup root file descriptor. */ +- close_prot_errno_disarm(ops->dfd_mnt); +- +- /* +- * The checking for freezer support should obviously be done at cgroup +- * initialization time but that doesn't work reliable. The freezer +- * controller has been demoted (rightly so) to a simple file located in +- * each non-root cgroup. 
At the time when the container is created we +- * might still be located in /sys/fs/cgroup and so checking for +- * cgroup.freeze won't tell us anything because this file doesn't exist +- * in the root cgroup. We could then iterate through /sys/fs/cgroup and +- * find an already existing cgroup and then check within that cgroup +- * for the existence of cgroup.freeze but that will only work on +- * systemd based hosts. Other init systems might not manage cgroups and +- * so no cgroup will exist. So we defer until we have created cgroups +- * for our container which means we check here. +- */ +- if (pure_unified_layout(ops) && +- !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK, +- AT_SYMLINK_NOFOLLOW)) { +- TRACE("Unified hierarchy supports freezer"); +- ops->unified->utilities |= FREEZER_CONTROLLER; +- } +-} +- +-/* cgroup-full:* is done, no need to create subdirs */ +-static inline bool cg_mount_needs_subdirs(int type) +-{ +- return !(type >= LXC_AUTO_CGROUP_FULL_RO); +-} +- +-/* After $rootfs/sys/fs/container/controller/the/cg/path has been created, +- * remount controller ro if needed and bindmount the cgroupfs onto +- * control/the/cg/path. 
+- */ +-static int cg_legacy_mount_controllers(int type, struct hierarchy *h, +- char *controllerpath, char *cgpath, +- const char *container_cgroup) +-{ +- __do_free char *sourcepath = NULL; +- int ret, remount_flags; +- int flags = MS_BIND; +- +- if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) { +- ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"", +- controllerpath, controllerpath); +- +- remount_flags = add_required_remount_flags(controllerpath, +- controllerpath, +- flags | MS_REMOUNT); +- ret = mount(controllerpath, controllerpath, "cgroup", +- remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY, +- NULL); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath); +- +- INFO("Remounted %s read-only", controllerpath); +- } +- +- sourcepath = must_make_path(h->at_mnt, h->at_base, +- container_cgroup, NULL); +- if (type == LXC_AUTO_CGROUP_RO) +- flags |= MS_RDONLY; +- +- ret = mount(sourcepath, cgpath, "cgroup", flags, NULL); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"", +- h->controllers[0], cgpath); +- INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath); +- +- if (flags & MS_RDONLY) { +- remount_flags = add_required_remount_flags(sourcepath, cgpath, +- flags | MS_REMOUNT); +- ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath); +- INFO("Remounted %s read-only", cgpath); +- } +- +- INFO("Completed second stage cgroup automounts for \"%s\"", cgpath); +- return 0; +-} +- +-/* __cgroupfs_mount +- * +- * Mount cgroup hierarchies directly without using bind-mounts. The main +- * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting +- * cgroups for the LXC_AUTO_CGROUP_FULL option. 
+- */ +-static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, +- struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs, +- const char *hierarchy_mnt) +-{ +- __do_close int fd_fs = -EBADF; +- unsigned int flags = 0; +- char *fstype; +- int ret; +- +- if (dfd_mnt_cgroupfs < 0) +- return ret_errno(EINVAL); +- +- flags |= MOUNT_ATTR_NOSUID; +- flags |= MOUNT_ATTR_NOEXEC; +- flags |= MOUNT_ATTR_NODEV; +- flags |= MOUNT_ATTR_RELATIME; +- +- if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) || +- (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO) || +- (cgroup_automount_type == LXC_AUTO_CGROUP2_RO)) +- flags |= MOUNT_ATTR_RDONLY; +- +- if (is_unified_hierarchy(h)) +- fstype = "cgroup2"; +- else +- fstype = "cgroup"; +- +- if (can_use_mount_api()) { +- fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0); +- if (fd_fs < 0) +- return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype); +- +- if (!is_unified_hierarchy(h)) { +- for (const char **it = (const char **)h->controllers; it && *it; it++) { +- if (strnequal(*it, "name=", STRLITERALLEN("name="))) +- ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name=")); +- else +- ret = fs_set_property(fd_fs, *it, ""); +- if (ret < 0) +- return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs); +- } +- } +- +- ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt, +- PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, +- flags); +- } else { +- __do_free char *controllers = NULL, *target = NULL; +- unsigned int old_flags = 0; +- const char *rootfs_mnt; +- +- if (!is_unified_hierarchy(h)) { +- controllers = lxc_string_join(",", (const char **)h->controllers, false); +- if (!controllers) +- return ret_errno(ENOMEM); +- } +- +- rootfs_mnt = get_rootfs_mnt(rootfs); +- ret = mnt_attributes_old(flags, &old_flags); +- if (ret) +- return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified"); +- +- target = 
must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL); +-#ifdef HAVE_ISULAD +- ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt, NULL); +-#else +- ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt); +-#endif +- } +- if (ret < 0) +- return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)", +- fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); +- +- DEBUG("Mounted cgroup filesystem %s onto %d(%s)", +- fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); +- return 0; +-} +- +-static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, +- struct lxc_rootfs *rootfs, +- int dfd_mnt_cgroupfs, const char *hierarchy_mnt) +-{ +- return __cgroupfs_mount(cgroup_automount_type, h, rootfs, +- dfd_mnt_cgroupfs, hierarchy_mnt); +-} +- +-static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h, +- struct lxc_rootfs *rootfs, +- int dfd_mnt_cgroupfs, +- const char *hierarchy_mnt) +-{ +- switch (cgroup_automount_type) { +- case LXC_AUTO_CGROUP_FULL_RO: +- break; +- case LXC_AUTO_CGROUP_FULL_RW: +- break; +- case LXC_AUTO_CGROUP_FULL_MIXED: +- break; +- default: +- return 0; +- } +- +- return __cgroupfs_mount(cgroup_automount_type, h, rootfs, +- dfd_mnt_cgroupfs, hierarchy_mnt); +-} +- +-/* __cg_mount_direct +- * +- * Mount cgroup hierarchies directly without using bind-mounts. The main +- * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting +- * cgroups for the LXC_AUTO_CGROUP_FULL option. 
+- */ +-static int __cg_mount_direct(int type, struct hierarchy *h, +- const char *controllerpath) +-{ +- __do_free char *controllers = NULL; +- char *fstype = "cgroup2"; +- unsigned long flags = 0; +- int ret; +- +- flags |= MS_NOSUID; +- flags |= MS_NOEXEC; +- flags |= MS_NODEV; +- flags |= MS_RELATIME; +- +- if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO) +- flags |= MS_RDONLY; +- +- if (h->fs_type != CGROUP2_SUPER_MAGIC) { +- controllers = lxc_string_join(",", (const char **)h->controllers, false); +- if (!controllers) +- return -ENOMEM; +- fstype = "cgroup"; +- } +- +- ret = mount("cgroup", controllerpath, fstype, flags, controllers); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to mount \"%s\" with cgroup filesystem type %s", +- controllerpath, fstype); +- +- DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype); +- return 0; +-} +- +-static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h, +- const char *controllerpath) +-{ +- return __cg_mount_direct(type, h, controllerpath); +-} +- +-static inline int cg_mount_cgroup_full(int type, struct hierarchy *h, +- const char *controllerpath) +-{ +- if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) +- return 0; +- +- return __cg_mount_direct(type, h, controllerpath); +-} +- +-__cgfsng_ops static bool isulad_cgfsng_mount(struct cgroup_ops *ops, +- struct lxc_handler *handler, int cg_flags) +-{ +- __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF; +- __do_free char *cgroup_root = NULL; +- int cgroup_automount_type; +- bool in_cgroup_ns = false, wants_force_mount = false; +- struct lxc_conf *conf = handler->conf; +- struct lxc_rootfs *rootfs = &conf->rootfs; +- const char *rootfs_mnt = get_rootfs_mnt(rootfs); +- int ret; +-#ifdef HAVE_ISULAD +- char **merged = NULL; +- __do_free char *systemdpath = NULL; +- __do_free char *unifiedpath = NULL; +-#endif +- +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +- 
if (!ops->hierarchies) +- return true; +- +- if (!conf) +- return ret_set_errno(false, EINVAL); +- +- if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0) +- return log_trace(true, "No cgroup mounts requested"); +- +- if (cg_flags & LXC_AUTO_CGROUP_FORCE) { +- cg_flags &= ~LXC_AUTO_CGROUP_FORCE; +- wants_force_mount = true; +- } +- +- switch (cg_flags) { +- case LXC_AUTO_CGROUP_RO: +- TRACE("Read-only cgroup mounts requested"); +- break; +- case LXC_AUTO_CGROUP_RW: +- TRACE("Read-write cgroup mounts requested"); +- break; +- case LXC_AUTO_CGROUP_MIXED: +- TRACE("Mixed cgroup mounts requested"); +- break; +- case LXC_AUTO_CGROUP_FULL_RO: +- TRACE("Full read-only cgroup mounts requested"); +- break; +- case LXC_AUTO_CGROUP_FULL_RW: +- TRACE("Full read-write cgroup mounts requested"); +- break; +- case LXC_AUTO_CGROUP_FULL_MIXED: +- TRACE("Full mixed cgroup mounts requested"); +- break; +- case LXC_AUTO_CGROUP2_RW: +- TRACE("Read-write cgroup2 mount requested"); +- break; +- case LXC_AUTO_CGROUP2_RO: +- TRACE("Read-only cgroup2 mount requested"); +- break; +- default: +- return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified"); +- } +- cgroup_automount_type = cg_flags; +- +- if (!wants_force_mount) { +- wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf); +- +- /* +- * Most recent distro versions currently have init system that +- * do support cgroup2 but do not mount it by default unless +- * explicitly told so even if the host is cgroup2 only. That +- * means they often will fail to boot. Fix this by pre-mounting +- * cgroup2 by default. We will likely need to be doing this a +- * few years until all distros have switched over to cgroup2 at +- * which point we can safely assume that their init systems +- * will mount it themselves. 
+- */ +- if (pure_unified_layout(ops)) +- wants_force_mount = true; +- } +- +- if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP)) +- in_cgroup_ns = true; +- +- if (in_cgroup_ns && !wants_force_mount) +- return log_trace(true, "Mounting cgroups not requested or needed"); +- +- /* This is really the codepath that we want. */ +- if (pure_unified_layout(ops) || +- (cgroup_automount_type == LXC_AUTO_CGROUP2_RW) || +- (cgroup_automount_type == LXC_AUTO_CGROUP2_RO)) { +- __do_close int dfd_mnt_unified = -EBADF; +- +- if (!ops->unified) +- return log_error_errno(false, EINVAL, "No unified cgroup hierarchy mounted on the host"); +- +- dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, +- PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); +- if (dfd_mnt_unified < 0) +- return syserror_ret(false, "Failed to open %d(%s)", +- rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); +- /* +- * If cgroup namespaces are supported but the container will +- * not have CAP_SYS_ADMIN after it has started we need to mount +- * the cgroups manually. +- * +- * Note that here we know that wants_force_mount is true. +- * Otherwise we would've returned early above. +- */ +- if (in_cgroup_ns) { +- /* +- * 1. cgroup:rw:force -> Mount the cgroup2 filesystem. +- * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only. +- * 3. cgroup:mixed:force -> See comment above how this +- * does not apply so +- * cgroup:mixed is equal to +- * cgroup:rw when cgroup +- * namespaces are supported. +- +- * 4. cgroup:rw -> No-op; init system responsible for mounting. +- * 5. cgroup:ro -> No-op; init system responsible for mounting. +- * 6. cgroup:mixed -> No-op; init system responsible for mounting. +- * +- * 7. cgroup-full:rw -> Not supported. +- * 8. cgroup-full:ro -> Not supported. +- * 9. cgroup-full:mixed -> Not supported. +- +- * 10. cgroup-full:rw:force -> Not supported. +- * 11. cgroup-full:ro:force -> Not supported. +- * 12. 
cgroup-full:mixed:force -> Not supported. +- * +- * 13. cgroup2 -> No-op; init system responsible for mounting. +- * 14. cgroup2:ro -> No-op; init system responsible for mounting. +- * 15. cgroup2:force -> Mount the cgroup2 filesystem read-write +- * 16. cgroup2:ro:force -> Mount the cgroup2 filesystem read-only +- */ +- ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, ""); +- if (ret < 0) +- return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace"); +- +- return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace"); +- } else { +- /* +- * Either no cgroup namespace supported (highly +- * unlikely unless we're dealing with a Frankenkernel. +- * Or the user requested to keep the cgroup namespace +- * of the host or another container. +- */ +- errno = EOPNOTSUPP; +- if (wants_force_mount) +- SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported"); +- else +- SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported"); +- } +- +- return syserror_ret(false, "Failed to mount cgroups"); +- } +- +- /* +- * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're +- * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the +- * DEFAULT_CGROUP_MOUNTPOINT define. 
+- */ +- if (can_use_mount_api()) { +- fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0); +- if (fd_fs < 0) +- return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs"); +- +- ret = fs_set_property(fd_fs, "mode", "0755"); +- if (ret < 0) +- return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); +- +- ret = fs_set_property(fd_fs, "size", "10240k"); +- if (ret < 0) +- return log_error_errno(false, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); +- +- ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, +- PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, +- MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | +- MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME); +- } else { +- cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); +- ret = safe_mount(NULL, cgroup_root, "tmpfs", +- MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, +- "size=10240k,mode=755", rootfs_mnt, handler->conf->rootfs.lsm_se_mount_context); +- } +- if (ret < 0) +- return log_error_errno(false, errno, "Failed to mount tmpfs on %s", +- DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); +- +- dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, +- PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); +- if (dfd_mnt_tmpfs < 0) +- return syserror_ret(false, "Failed to open %d(%s)", +- rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); +- +- for (int i = 0; ops->hierarchies[i]; i++) { +- __do_free char *hierarchy_mnt = NULL, *path2 = NULL; +- struct hierarchy *h = ops->hierarchies[i]; +- +-#ifdef HAVE_ISULAD +- // isulad: symlink subcgroup +- if (strchr(h->at_mnt, ',') != NULL) { +- int pret; +- pret = lxc_append_string(&merged, h->at_mnt); +- if (pret < 0) +- return false; +- } +-#endif +- +- ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000); +-#ifdef HAVE_ISULAD +- if (ret < 0) { +- lxc_free_array((void **)merged, free); +- return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, 
h->at_mnt); +- } +-#else +- if (ret < 0) +- return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt); +-#endif +- +- if (in_cgroup_ns && wants_force_mount) { +- /* +- * If cgroup namespaces are supported but the container +- * will not have CAP_SYS_ADMIN after it has started we +- * need to mount the cgroups manually. +- */ +- ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, +- dfd_mnt_tmpfs, h->at_mnt); +-#ifdef HAVE_ISULAD +- if (ret < 0) { +- lxc_free_array((void **)merged, free); +- return false; +- } +-#else +- if (ret < 0) +- return false; +-#endif +- continue; +- } +- +- /* Here is where the ancient kernel section begins. */ +- ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, +- dfd_mnt_tmpfs, h->at_mnt); +-#ifdef HAVE_ISULAD +- if (ret < 0) { +- lxc_free_array((void **)merged, free); +- return false; +- } +-#else +- if (ret < 0) +- return false; +-#endif +- +- if (!cg_mount_needs_subdirs(cgroup_automount_type)) +- continue; +- +- if (!cgroup_root) +- cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); +- +- hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL); +-#ifdef HAVE_ISULAD +- // isulad: ignore ops->container_cgroup so we will not see directory lxc after /sys/fs/cgroup/xxx in container, +- // isulad: ignore h->container_base_path so we will not see subgroup of /sys/fs/cgroup/xxx/subgroup in container +- path2 = must_make_path(h->at_mnt, NULL); +-#else +- path2 = must_make_path(hierarchy_mnt, h->at_base, +- ops->container_cgroup, NULL); +-#endif +- ret = mkdir_p(path2, 0755); +-#ifdef HAVE_ISULAD +- if (ret < 0 && (errno != EEXIST)) { +- lxc_free_array((void **)merged, free); +- return false; +- } +-#else +- if (ret < 0 && (errno != EEXIST)) +- return false; +-#endif +- +- ret = cg_legacy_mount_controllers(cgroup_automount_type, h, +- hierarchy_mnt, path2, +- ops->container_cgroup); +-#ifdef HAVE_ISULAD +- if (ret < 0) { +- lxc_free_array((void **)merged, free); +- 
return false; +- } +-#else +- if (ret < 0) +- return false; +-#endif +- } +- +-#ifdef HAVE_ISULAD +- // isulad: symlink subcgroup +- if (merged) { +- char **mc = NULL; +- for (mc = merged; *mc; mc++) { +- char *token = NULL; +- char *copy = must_copy_string(*mc); +- lxc_iterate_parts(token, copy, ",") { +- int mret; +- char *link; +- link = must_make_path(cgroup_root, token, NULL); +- mret = symlink(*mc, link); +- if (mret < 0 && errno != EEXIST) { +- SYSERROR("Failed to create link %s for target %s", link, *mc); +- free(copy); +- free(link); +- lxc_free_array((void **)merged, free); +- return false; +- } +- free(link); +- } +- free(copy); +- } +- } +- +- // isulad: remount /sys/fs/cgroup to readonly +- if (cg_flags == LXC_AUTO_CGROUP_FULL_RO || cg_flags == LXC_AUTO_CGROUP_RO) { +- ret = mount(cgroup_root, cgroup_root, "bind", +- MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME|MS_RDONLY|MS_BIND|MS_REMOUNT, NULL); +- if (ret < 0) { +- SYSERROR("Failed to remount /sys/fs/cgroup."); +- lxc_free_array((void **)merged, free); +- return false; +- } +- } +- +- // isulad: remount /sys/fs/cgroup/systemd to readwrite for system container +- if (handler->conf->systemd != NULL && strcmp(handler->conf->systemd, "true") == 0) +- { +- unifiedpath = must_make_path(get_rootfs_mnt(rootfs), "/sys/fs/cgroup/unified", NULL); +- if (dir_exists(unifiedpath)) +- { +- ret = umount2(unifiedpath, MNT_DETACH); +- if (ret < 0) +- { +- SYSERROR("Failed to umount /sys/fs/cgroup/unified."); +- lxc_free_array((void **)merged, free); +- return false; +- } +- } +- +- systemdpath = must_make_path(get_rootfs_mnt(rootfs), "/sys/fs/cgroup/systemd", NULL); +- ret = mount(systemdpath, systemdpath, "bind", +- MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME | MS_BIND | MS_REMOUNT, NULL); +- if (ret < 0) +- { +- SYSERROR("Failed to remount /sys/fs/cgroup/systemd."); +- lxc_free_array((void **)merged, free); +- return false; +- } +- } +-#endif +- +- return true; +-} +- +-/* Only root needs to escape to the cgroup of 
its init. */ +-__cgfsng_ops static bool isulad_cgfsng_criu_escape(const struct cgroup_ops *ops, +- struct lxc_conf *conf) +-{ +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +- if (!ops->hierarchies) +- return true; +- +- if (!conf) +- return ret_set_errno(false, EINVAL); +- +- if (conf->cgroup_meta.relative || geteuid()) +- return true; +- +- for (int i = 0; ops->hierarchies[i]; i++) { +- __do_free char *fullpath = NULL; +- int ret; +- +- fullpath = +- must_make_path(ops->hierarchies[i]->at_mnt, +- ops->hierarchies[i]->at_base, +- "cgroup.procs", NULL); +- ret = lxc_write_to_file(fullpath, "0", 2, false, 0666); +- if (ret != 0) +- return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath); +- } +- +- return true; +-} +- +-__cgfsng_ops static int isulad_cgfsng_criu_num_hierarchies(struct cgroup_ops *ops) +-{ +- int i = 0; +- +- if (!ops) +- return ret_set_errno(-1, ENOENT); +- +- if (!ops->hierarchies) +- return 0; +- +- for (; ops->hierarchies[i]; i++) +- ; +- +- return i; +-} +- +-__cgfsng_ops static bool isulad_cgfsng_criu_get_hierarchies(struct cgroup_ops *ops, int n, +- char ***out) +-{ +- int i; +- +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +- if (!ops->hierarchies) +- return ret_set_errno(false, ENOENT); +- +- /* sanity check n */ +- for (i = 0; i < n; i++) +- if (!ops->hierarchies[i]) +- return ret_set_errno(false, ENOENT); +- +- *out = ops->hierarchies[i]->controllers; +- +- return true; +-} +- +-static bool cg_legacy_freeze(struct cgroup_ops *ops) +-{ +- struct hierarchy *h; +- +- h = get_hierarchy(ops, "freezer"); +- if (!h) +- return ret_set_errno(-1, ENOENT); +- +- return lxc_write_openat(h->path_con, "freezer.state", +- "FROZEN", STRLITERALLEN("FROZEN")); +-} +- +-static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata, +- struct lxc_async_descr *descr) +-{ +- __do_close int duped_fd = -EBADF; +- __do_free char *line = NULL; +- __do_fclose FILE *f = NULL; +- int state = 
PTR_TO_INT(cbdata); +- size_t len; +- const char *state_string; +- +- duped_fd = dup(fd); +- if (duped_fd < 0) +- return LXC_MAINLOOP_ERROR; +- +- if (lseek(duped_fd, 0, SEEK_SET) < (off_t)-1) +- return LXC_MAINLOOP_ERROR; +- +- f = fdopen(duped_fd, "re"); +- if (!f) +- return LXC_MAINLOOP_ERROR; +- move_fd(duped_fd); +- +- if (state == 1) +- state_string = "frozen 1"; +- else +- state_string = "frozen 0"; +- +- while (getline(&line, &len, f) != -1) +- if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0) +- return LXC_MAINLOOP_CLOSE; +- +- return LXC_MAINLOOP_CONTINUE; +-} +- +-static int cg_unified_freeze(struct cgroup_ops *ops, int timeout) +-{ +- __do_close int fd = -EBADF; +- call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL; +- int ret; +- struct lxc_async_descr descr; +- struct hierarchy *h; +- +- h = ops->unified; +- if (!h) +- return ret_set_errno(-1, ENOENT); +- +- if (!h->path_con) +- return ret_set_errno(-1, EEXIST); +- +- if (timeout != 0) { +- __do_free char *events_file = NULL; +- +- events_file = must_make_path(h->path_con, "cgroup.events", NULL); +- fd = open(events_file, O_RDONLY | O_CLOEXEC); +- if (fd < 0) +- return log_error_errno(-1, errno, "Failed to open cgroup.events file"); +- +- ret = lxc_mainloop_open(&descr); +- if (ret) +- return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze"); +- +- /* automatically cleaned up now */ +- descr_ptr = &descr; +- +- ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, default_cleanup_handler, +- INT_TO_PTR((int){1}), "freezer_cgroup_events"); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); +- } +- +- ret = lxc_write_openat(h->path_con, "cgroup.freeze", "1", 1); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); +- +- if (timeout != 0 && lxc_mainloop(&descr, timeout)) +- return log_error_errno(-1, errno, "Failed 
to wait for container to be frozen"); +- +- return 0; +-} +- +-__cgfsng_ops static int isulad_cgfsng_freeze(struct cgroup_ops *ops, int timeout) +-{ +- if (!ops->hierarchies) +- return ret_set_errno(-1, ENOENT); +- +- if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) +- return cg_legacy_freeze(ops); +- +- return cg_unified_freeze(ops, timeout); +-} +- +-static int cg_legacy_unfreeze(struct cgroup_ops *ops) +-{ +- struct hierarchy *h; +- +- h = get_hierarchy(ops, "freezer"); +- if (!h) +- return ret_set_errno(-1, ENOENT); +- +- return lxc_write_openat(h->path_con, "freezer.state", +- "THAWED", STRLITERALLEN("THAWED")); +-} +- +-static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout) +-{ +- __do_close int fd = -EBADF; +- call_cleaner(lxc_mainloop_close)struct lxc_async_descr *descr_ptr = NULL; +- int ret; +- struct lxc_async_descr descr; +- struct hierarchy *h; +- +- h = ops->unified; +- if (!h) +- return ret_set_errno(-1, ENOENT); +- +- if (!h->path_con) +- return ret_set_errno(-1, EEXIST); +- +- if (timeout != 0) { +- __do_free char *events_file = NULL; +- +- events_file = must_make_path(h->path_con, "cgroup.events", NULL); +- fd = open(events_file, O_RDONLY | O_CLOEXEC); +- if (fd < 0) +- return log_error_errno(-1, errno, "Failed to open cgroup.events file"); +- +- ret = lxc_mainloop_open(&descr); +- if (ret) +- return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze"); +- +- /* automatically cleaned up now */ +- descr_ptr = &descr; +- +- ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, default_cleanup_handler, +- INT_TO_PTR((int){0}), "freezer_cgroup_events"); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); +- } +- +- ret = lxc_write_openat(h->path_con, "cgroup.freeze", "0", 1); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); +- +- if (timeout != 0 && lxc_mainloop(&descr, timeout)) +- 
return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen"); +- +- return 0; +-} +- +-__cgfsng_ops static int isulad_cgfsng_unfreeze(struct cgroup_ops *ops, int timeout) +-{ +- if (!ops->hierarchies) +- return ret_set_errno(-1, ENOENT); +- +- if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) +- return cg_legacy_unfreeze(ops); +- +- return cg_unified_unfreeze(ops, timeout); +-} +- +-__cgfsng_ops static const char *isulad_cgfsng_get_cgroup(struct cgroup_ops *ops, +- const char *controller) +-{ +- struct hierarchy *h; +- +- h = get_hierarchy(ops, controller); +- if (!h) +- return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"", +- controller ? controller : "(null)"); +- +- if (!h->path_con) +- h->path_con = must_make_path(h->at_mnt, h->at_base, ops->container_cgroup, NULL); +- +- return h->path_con +- ? h->path_con + strlen(h->at_mnt) +- : NULL; +-} +- +-__cgfsng_ops static const char *isulad_cgfsng_get_cgroup_full_path(struct cgroup_ops *ops, +- const char *controller) +-{ +- struct hierarchy *h; +- +- h = get_hierarchy(ops, controller); +- if (!h) +- return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"", +- controller ? controller : "(null)"); +- +- if (!h->path_con) +- h->path_con = must_make_path(h->at_mnt, h->at_base, ops->container_cgroup, NULL); +- +- return h->path_con; +-} +- +-/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path, +- * which must be freed by the caller. +- */ +-static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h, +- const char *inpath, +- const char *filename) +-{ +- return must_make_path(h->at_mnt, inpath, filename, NULL); +-} +- +-static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid) +-{ +- int idx = 1; +- int ret; +- char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; +- size_t pidstr_len; +- +- /* Create leaf cgroup. 
*/ +- ret = mkdirat(unified_fd, ".lxc", 0755); +- if (ret < 0 && errno != EEXIST) +- return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\""); +- +- pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid); +- ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len); +- if (ret < 0) +- ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len); +- if (ret == 0) +- return 0; +- +- /* this is a non-leaf node */ +- if (errno != EBUSY) +- return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); +- +- do { +- bool rm = false; +- char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1]; +- char *slash; +- +- ret = snprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx); +- if (ret < 0 || (size_t)ret >= sizeof(attach_cgroup)) +- return ret_errno(EIO); +- +- /* +- * This shouldn't really happen but the compiler might complain +- * that a short write would cause a buffer overrun. So be on +- * the safe side. 
+- */ +- if ((size_t)ret < STRLITERALLEN(".lxc-/cgroup.procs")) +- return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun"); +- +- slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs"); +- *slash = '\0'; +- +- ret = mkdirat(unified_fd, attach_cgroup, 0755); +- if (ret < 0 && errno != EEXIST) +- return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup); +- if (ret == 0) +- rm = true; +- +- *slash = '/'; +- +- ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len); +- if (ret == 0) +- return 0; +- +- if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR)) +- SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup); +- +- /* this is a non-leaf node */ +- if (errno != EBUSY) +- return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); +- +- idx++; +- } while (idx < 1000); +- +- return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); +-} +- +-static int cgroup_attach_create_leaf(const struct lxc_conf *conf, +- int unified_fd, int *sk_fd, bool unprivileged) +-{ +- __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; +- int target_fds[2]; +- ssize_t ret; +- +- /* Create leaf cgroup. 
*/ +- ret = mkdirat(unified_fd, ".lxc", 0755); +- if (ret < 0 && errno != EEXIST) +- return syserror("Failed to create leaf cgroup \".lxc\""); +- +- if (unprivileged) { +- target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); +- if (target_fd0 < 0) +- return syserror("Failed to open \".lxc/cgroup.procs\""); +- target_fds[0] = target_fd0; +- +- target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); +- if (target_fd1 < 0) +- return syserror("Failed to open \".lxc/cgroup.procs\""); +- target_fds[1] = target_fd1; +- +- ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0); +- if (ret <= 0) +- return syserror("Failed to send \".lxc/cgroup.procs\" fds %d and %d", +- target_fd0, target_fd1); +- +- TRACE("Sent cgroup file descriptors %d and %d", target_fd0, target_fd1); +- } else { +- ret = lxc_abstract_unix_send_credential(sk, NULL, 0); +- if (ret < 0) +- return syserror("Failed to inform parent that we are done setting up mounts"); +- +- TRACE("Informed parent process that cgroup has been created"); +- } +- +- return 0; +-} +- +-static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf, +- const char *lxcpath, +- int unified_fd, int *sk_fd, pid_t pid, +- bool unprivileged) +-{ +- __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; +- char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; +- size_t pidstr_len; +-#if HAVE_LIBSYSTEMD +- __do_free char *scope = NULL; +-#endif +- ssize_t ret; +- +-#if HAVE_LIBSYSTEMD +- scope = lxc_cmd_get_systemd_scope(conf->name, lxcpath); +- if (scope) { +- TRACE("%s:%s is running under systemd-created scope '%s'. 
Attaching...", lxcpath, conf->name, scope); +- if (enter_scope(scope, pid)) +- TRACE("Successfully entered scope '%s'", scope); +- else +- ERROR("Failed entering scope '%s'", scope); +- } else { +- TRACE("%s:%s is not running under a systemd-created scope", lxcpath, conf->name); +- } +-#endif +- if (unprivileged) { +- ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to receive target cgroup fd"); +- } else { +- ret = lxc_abstract_unix_rcv_credential(sk, NULL, 0); +- if (ret < 0) +- return syserror("Failed to receive notification from parent process"); +- +- TRACE("Child process informed us that cgroup has been created"); +- +- target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); +- if (target_fd0 < 0) +- return syserror("Failed to open \".lxc/cgroup.procs\""); +- +- target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); +- if (target_fd1 < 0) +- return syserror("Failed to open \".lxc/cgroup.procs\""); +- +- TRACE("Opened target cgroup file descriptors %d and %d", target_fd0, target_fd1); +- } +- +- pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid); +- +- ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len); +- if (ret > 0 && (size_t)ret == pidstr_len) +- return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0); +- +- ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len); +- if (ret > 0 && (size_t)ret == pidstr_len) +- return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1); +- +- return syserror("Failed to move process into target cgroup via fd %d and %d", target_fd0, target_fd1); +-} +- +-struct userns_exec_unified_attach_data { +- const struct lxc_conf *conf; +- const char *lxcpath; +- int unified_fd; +- int sk_pair[2]; +- pid_t pid; +- bool unprivileged; +-}; +- +-static int cgroup_unified_attach_child_wrapper(void *data) +-{ +- struct 
userns_exec_unified_attach_data *args = data; +- +- if (!args->conf || !args->lxcpath || args->unified_fd < 0 || +- args->pid <= 0 || args->sk_pair[0] < 0 || args->sk_pair[1] < 0) +- return ret_errno(EINVAL); +- +- close_prot_errno_disarm(args->sk_pair[0]); +- return cgroup_attach_create_leaf(args->conf, args->unified_fd, +- &args->sk_pair[1], args->unprivileged); +-} +- +-static int cgroup_unified_attach_parent_wrapper(void *data) +-{ +- struct userns_exec_unified_attach_data *args = data; +- +- if (!args->conf || args->unified_fd < 0 || args->pid <= 0 || +- args->sk_pair[0] < 0 || args->sk_pair[1] < 0) +- return ret_errno(EINVAL); +- +- close_prot_errno_disarm(args->sk_pair[1]); +- return cgroup_attach_move_into_leaf(args->conf, args->lxcpath, +- args->unified_fd, +- &args->sk_pair[0], args->pid, +- args->unprivileged); +-} +- +-/* Technically, we're always at a delegation boundary here (This is especially +- * true when cgroup namespaces are available.). The reasoning is that in order +- * for us to have been able to start a container in the first place the root +- * cgroup must have been a leaf node. Now, either the container's init system +- * has populated the cgroup and kept it as a leaf node or it has created +- * subtrees. In the former case we will simply attach to the leaf node we +- * created when we started the container in the latter case we create our own +- * cgroup for the attaching process. 
+- */ +-static int __cg_unified_attach(const struct hierarchy *h, +- const struct lxc_conf *conf, const char *name, +- const char *lxcpath, pid_t pid, +- const char *controller) +-{ +- __do_close int unified_fd = -EBADF; +- __do_free char *path = NULL, *cgroup = NULL; +- int ret; +- +- if (!conf || !name || !lxcpath || pid <= 0) +- return ret_errno(EINVAL); +- +- ret = cgroup_attach(conf, name, lxcpath, pid); +- if (ret == 0) +- return log_trace(0, "Attached to unified cgroup via command handler"); +- TRACE("__cg_unified_attach: cgroup_attach returned %d", ret); +- if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2) +- return log_error_errno(ret, errno, "Failed to attach to unified cgroup"); +- +- /* Fall back to retrieving the path for the unified cgroup. */ +- cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller); +- /* not running */ +- if (!cgroup) +- return 0; +- TRACE("lxc_cmd_get_cgroup_path returned %s", cgroup); +- +- path = make_cgroup_path(h, cgroup, NULL); +- +- unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC); +- if (unified_fd < 0) +- return ret_errno(EBADF); +- +- if (!list_empty(&conf->id_map)) { +- struct userns_exec_unified_attach_data args = { +- .conf = conf, +- .unified_fd = unified_fd, +- .pid = pid, +- .unprivileged = am_guest_unpriv(), +- .lxcpath = lxcpath, +- }; +- +- ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); +- if (ret < 0) +- return -errno; +- +- ret = userns_exec_minimal(conf, +- cgroup_unified_attach_parent_wrapper, +- &args, +- cgroup_unified_attach_child_wrapper, +- &args); +- } else { +- ret = cgroup_attach_leaf(conf, unified_fd, pid); +- } +- +- return ret; +-} +- +-__cgfsng_ops static bool isulad_cgfsng_attach(struct cgroup_ops *ops, +- const struct lxc_conf *conf, +- const char *name, const char *lxcpath, +- pid_t pid) +-{ +- int len, ret; +- char pidstr[INTTYPE_TO_STRLEN(pid_t)]; +- +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +-#ifdef HAVE_ISULAD +- if 
(ops->no_controller) { +- DEBUG("no controller found, isgnore isulad_cgfsng_attach"); +- return true; +- } +-#endif +- +- if (!ops->hierarchies) +- return true; +- +- len = snprintf(pidstr, sizeof(pidstr), "%d", pid); +- if (len < 0 || (size_t)len >= sizeof(pidstr)) +- return false; +- +- for (int i = 0; ops->hierarchies[i]; i++) { +- __do_free char *fullpath = NULL, *path = NULL; +- struct hierarchy *h = ops->hierarchies[i]; +- +- if (h->fs_type == CGROUP2_SUPER_MAGIC) { +- ret = __cg_unified_attach(h, conf, name, lxcpath, pid, +- h->controllers[0]); +- if (ret < 0) +- return false; +- +- continue; +- } +- +- path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]); +- /* not running */ +- if (!path) +- return false; +- +- fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs"); +- ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666); +- if (ret < 0) +- return log_error_errno(false, errno, "Failed to attach %d to %s", +- (int)pid, fullpath); +- } +- +- return true; +-} +- +-__cgfsng_ops static int isulad_cgfsng_get(struct cgroup_ops *ops, const char *filename, +- char *value, size_t len, const char *name, +- const char *lxcpath) +-{ +- int ret = -1; +- size_t controller_len; +- char *controller, *p, *path; +- struct hierarchy *h; +- +- controller_len = strlen(filename); +- controller = alloca(controller_len + 1); +- (void)strlcpy(controller, filename, controller_len + 1); +- +- p = strchr(controller, '.'); +- if (p) +- *p = '\0'; +- +- const char *ori_path = ops->get_cgroup(ops, controller); +- if (ori_path == NULL) { +- ERROR("Failed to get cgroup path:%s", controller); +- return -1; +- } +- path = safe_strdup(ori_path); +- +- h = get_hierarchy(ops, controller); +- if (h) { +- char *fullpath; +- +- fullpath = build_full_cgpath_from_monitorpath(h, path, filename); +- ret = lxc_read_from_file(fullpath, value, len); +- free(fullpath); +- } +- free(path); +- +- return ret; +-} +- +-static int device_cgroup_parse_access(struct device_item 
*device, const char *val) +-{ +- for (int count = 0; count < 3; count++, val++) { +- switch (*val) { +- case 'r': +- device->access[count] = *val; +- break; +- case 'w': +- device->access[count] = *val; +- break; +- case 'm': +- device->access[count] = *val; +- break; +- case '\n': +- case '\0': +- count = 3; +- break; +- default: +- return ret_errno(EINVAL); +- } +- } +- +- return 0; +-} +- +-static int device_cgroup_rule_parse(struct device_item *device, const char *key, +- const char *val) +-{ +- size_t count; +- int ret; +- char temp[50]; +- +- if (strequal("devices.allow", key)) +- device->allow = 1; /* allow the device */ +- else +- device->allow = 0; /* deny the device */ +- +- if (strequal(val, "a")) { +- /* global rule */ +- device->type = 'a'; +- device->major = -1; +- device->minor = -1; +- return 0; +- } +- +- switch (*val) { +- case 'a': +- __fallthrough; +- case 'b': +- __fallthrough; +- case 'c': +- device->type = *val; +- break; +- default: +- return -1; +- } +- +- val++; +- if (!isspace(*val)) +- return -1; +- val++; +- if (*val == '*') { +- device->major = -1; +- val++; +- } else if (isdigit(*val)) { +- memset(temp, 0, sizeof(temp)); +- for (count = 0; count < sizeof(temp) - 1; count++) { +- temp[count] = *val; +- val++; +- if (!isdigit(*val)) +- break; +- } +- ret = lxc_safe_int(temp, &device->major); +- if (ret) +- return -1; +- } else { +- return -1; +- } +- if (*val != ':') +- return -1; +- val++; +- +- /* read minor */ +- if (*val == '*') { +- device->minor = -1; +- val++; +- } else if (isdigit(*val)) { +- memset(temp, 0, sizeof(temp)); +- for (count = 0; count < sizeof(temp) - 1; count++) { +- temp[count] = *val; +- val++; +- if (!isdigit(*val)) +- break; +- } +- ret = lxc_safe_int(temp, &device->minor); +- if (ret) +- return -1; +- } else { +- return -1; +- } +- if (!isspace(*val)) +- return -1; +- +- return device_cgroup_parse_access(device, ++val); +-} +- +-__cgfsng_ops static int isulad_cgfsng_set(struct cgroup_ops *ops, +- const char 
*filename, const char *value, +- const char *name, const char *lxcpath) +-{ +- int ret = -1; +- size_t controller_len; +- char *controller, *p, *path; +- struct hierarchy *h; +- +- controller_len = strlen(filename); +- controller = alloca(controller_len + 1); +- (void)strlcpy(controller, filename, controller_len + 1); +- +- p = strchr(controller, '.'); +- if (p) +- *p = '\0'; +- +- const char *ori_path = ops->get_cgroup(ops, controller); +- if (ori_path == NULL) { +- ERROR("Failed to get cgroup path:%s", controller); +- return -1; +- } +- path = safe_strdup(ori_path); +- +- h = get_hierarchy(ops, controller); +- if (h) { +- char *fullpath; +- fullpath = build_full_cgpath_from_monitorpath(h, path, filename); +- +- if (strcmp(filename, "io.weight") == 0 || strcmp(filename, "io.bfq.weight") == 0) { +- if (!file_exists(fullpath)) { +- free(path); +- free(fullpath); +- return 0; +- } +- } +- +- ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666); +- free(fullpath); +- } +- free(path); +- +- return ret; +-} +- +-/* take devices cgroup line +- * /dev/foo rwx +- * and convert it to a valid +- * type major:minor mode +- * line. Return <0 on error. Dest is a preallocated buffer long enough to hold +- * the output. +- */ +-static int device_cgroup_rule_parse_devpath(struct device_item *device, +- const char *devpath) +-{ +- __do_free char *path = NULL; +- char *mode = NULL; +- int n_parts, ret; +- char *p; +- struct stat sb; +- +- path = strdup(devpath); +- if (!path) +- return ret_errno(ENOMEM); +- +- /* +- * Read path followed by mode. Ignore any trailing text. +- * A ' # comment' would be legal. Technically other text is not +- * legal, we could check for that if we cared to. 
+- */ +- for (n_parts = 1, p = path; *p; p++) { +- if (*p != ' ') +- continue; +- *p = '\0'; +- +- if (n_parts != 1) +- break; +- p++; +- n_parts++; +- +- while (*p == ' ') +- p++; +- +- mode = p; +- +- if (*p == '\0') +- return ret_set_errno(-1, EINVAL); +- } +- +- if (device_cgroup_parse_access(device, mode) < 0) +- return -1; +- +- ret = stat(path, &sb); +- if (ret < 0) +- return ret_set_errno(-1, errno); +- +- mode_t m = sb.st_mode & S_IFMT; +- switch (m) { +- case S_IFBLK: +- device->type = 'b'; +- break; +- case S_IFCHR: +- device->type = 'c'; +- break; +- default: +- return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path); +- } +- +- device->major = MAJOR(sb.st_rdev); +- device->minor = MINOR(sb.st_rdev); +- device->allow = 1; +- +- return 0; +-} +- +-static int convert_devpath(const char *invalue, char *dest) +-{ +- struct device_item device = {0}; +- int ret; +- +- ret = device_cgroup_rule_parse_devpath(&device, invalue); +- if (ret < 0) +- return -1; +- +- ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major, +- device.minor, device.access); +- if (ret < 0 || ret >= 50) +- return log_error_errno(-1, ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)", +- device.type, device.major, device.minor, device.access); +- +- return 0; +-} +- +-/* Called from setup_limits - here we have the container's cgroup_data because +- * we created the cgroups. 
+- */ +-static int isulad_cg_legacy_get_data(struct cgroup_ops *ops, const char *filename, +- char *value, size_t len) +-{ +- char *fullpath = NULL; +- char *p = NULL; +- struct hierarchy *h = NULL; +- int ret = 0; +- char *controller = NULL; +- +- len = strlen(filename); +- if (SIZE_MAX - 1 < len) { +- errno = EINVAL; +- return -1; +- } +- controller = calloc(1, len + 1); +- if (controller == NULL) { +- errno = ENOMEM; +- return -1; +- } +- (void)strlcpy(controller, filename, len + 1); +- +- p = strchr(controller, '.'); +- if (p) +- *p = '\0'; +- +- +- h = get_hierarchy(ops, controller); +- if (!h) { +- ERROR("Failed to setup limits for the \"%s\" controller. " +- "The controller seems to be unused by \"cgfsng\" cgroup " +- "driver or not enabled on the cgroup hierarchy", +- controller); +- errno = ENOENT; +- free(controller); +- return -ENOENT; +- } +- +- fullpath = must_make_path(h->path_con, filename, NULL); +- ret = lxc_read_from_file(fullpath, value, len); +- free(fullpath); +- free(controller); +- return ret; +-} +- +-static int isulad_cg_legacy_set_data(struct cgroup_ops *ops, const char *filename, +- const char *value) +-{ +- size_t len; +- char *fullpath, *p; +- /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */ +- char converted_value[50]; +- struct hierarchy *h; +- int ret = 0; +- char *controller = NULL; +- int retry_count = 0; +- int max_retry = 10; +- char *container_cgroup = ops->container_cgroup; +- +- len = strlen(filename); +- controller = alloca(len + 1); +- (void)strlcpy(controller, filename, len + 1); +- +- p = strchr(controller, '.'); +- if (p) +- *p = '\0'; +- +- if (strcmp("devices.allow", filename) == 0 && value[0] == '/') { +- ret = convert_devpath(value, converted_value); +- if (ret < 0) +- return ret; +- value = converted_value; +- } +- +- h = get_hierarchy(ops, controller); +- if (!h) { +- ERROR("Failed to setup limits for the \"%s\" controller. 
" +- "The controller seems to be unused by \"cgfsng\" cgroup " +- "driver or not enabled on the cgroup hierarchy", +- controller); +- errno = ENOENT; +- return -ENOENT; +- } +- +- fullpath = must_make_path(h->path_con, filename, NULL); +- +-retry: +- ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666); +- if (ret != 0) { +- if (retry_count < max_retry) { +- SYSERROR("setting cgroup config for ready process caused \"failed to write %s to %s\".", value, fullpath); +- (void)isulad_cg_legacy_handle_cpuset_hierarchy(h, container_cgroup); +- (void)isulad_mkdir_eexist_on_last(h->path_con, 0755); +- usleep(100 * 1000); /* 100 millisecond */ +- retry_count++; +- goto retry; +- } +- lxc_write_error_message(ops->errfd, +- "%s:%d: setting cgroup config for ready process caused failed to write %s to %s: %s", +- __FILE__, __LINE__, value, fullpath, strerror(errno)); +- } +- free(fullpath); +- return ret; +-} +- +-/* +- * Return the list of cgroup_settings sorted according to the following rules +- * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes +- */ +-static void sort_cgroup_settings(struct lxc_conf *conf) +-{ +- LIST_HEAD(memsw_list); +- struct lxc_cgroup *cgroup, *ncgroup; +- +- /* Iterate over the cgroup settings and copy them to the output list. */ +- list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) { +- if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes")) +- continue; +- +- /* Move the memsw entry from the cgroup settings list. */ +- list_move_tail(&cgroup->head, &memsw_list); +- } +- +- /* +- * Append all the memsw entries to the end of the cgroup settings list +- * to make sure they are applied after all memory limit settings. 
+- */ +- list_splice_tail(&memsw_list, &conf->cgroup); +- +-} +- +-__cgfsng_ops static bool isulad_cgfsng_setup_limits_legacy(struct cgroup_ops *ops, +- struct lxc_conf *conf, +- bool do_devices) +-{ +- struct list_head *cgroup_settings; +- struct lxc_cgroup *cgroup; +- char value[21 + 1] = { 0 }; +- long long int readvalue, setvalue; +- +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +- if (!conf) +- return ret_set_errno(false, EINVAL); +- +- cgroup_settings = &conf->cgroup; +- if (list_empty(cgroup_settings)) +- return true; +- +- if (!ops->hierarchies) +- return ret_set_errno(false, EINVAL); +- +- if (pure_unified_layout(ops)) +- return true; +- +- sort_cgroup_settings(conf); +- list_for_each_entry(cgroup, cgroup_settings, head) { +- if (do_devices == strnequal("devices", cgroup->subsystem, 7)) { +- const char *cgvalue = cgroup->value; +- if (strcmp(cgroup->subsystem, "files.limit") == 0) { +- if (lxc_safe_long_long(cgvalue, &setvalue) != 0) { +- SYSERROR("Invalid integer value %s", cgvalue); +- return false; +- } +- if (setvalue <= 0) { +- cgvalue = "max"; +- } +- } +- if (isulad_cg_legacy_set_data(ops, cgroup->subsystem, cgvalue)) { +- if (do_devices && (errno == EACCES || errno == EPERM)) { +- SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgvalue); +- continue; +- } +- SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgvalue); +- return false; +- } +- DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgvalue); +- } +- +- // isulad: check cpu shares +- if (strcmp(cgroup->subsystem, "cpu.shares") == 0) { +- if (isulad_cg_legacy_get_data(ops, cgroup->subsystem, value, sizeof(value) - 1) < 0) { +- SYSERROR("Error get %s", cgroup->subsystem); +- return false; +- } +- trim(value); +- if (lxc_safe_long_long(cgroup->value, &setvalue) != 0) { +- SYSERROR("Invalid value %s", cgroup->value); +- return false; +- } +- if (lxc_safe_long_long(value, &readvalue) != 0) { +- SYSERROR("Invalid value %s", value); +- return false; 
+- } +- if (setvalue > readvalue) { +- ERROR("The maximum allowed cpu-shares is %s", value); +- lxc_write_error_message(ops->errfd, +- "%s:%d: setting cgroup config for ready process caused \"The maximum allowed cpu-shares is %s\".", +- __FILE__, __LINE__, value); +- return false; +- } else if (setvalue < readvalue) { +- ERROR("The minimum allowed cpu-shares is %s", value); +- lxc_write_error_message(ops->errfd, +- "%s:%d: setting cgroup config for ready process caused \"The minimum allowed cpu-shares is %s\".", +- __FILE__, __LINE__, value); +- return false; +- } +- } +- } +- +- INFO("Limits for the legacy cgroup hierarchies have been setup"); +- return true; +-} +- +-/* +- * Some of the parsing logic comes from the original cgroup device v1 +- * implementation in the kernel. +- */ +-static int bpf_device_cgroup_prepare(struct cgroup_ops *ops, +- struct lxc_conf *conf, const char *key, +- const char *val) +-{ +- struct device_item device_item = {}; +- int ret; +- +- if (strequal("devices.allow", key) && abspath(val)) +- ret = device_cgroup_rule_parse_devpath(&device_item, val); +- else +- ret = device_cgroup_rule_parse(&device_item, key, val); +- if (ret < 0) +- return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val); +- +- /* +- * Note that bpf_list_add_device() returns 1 if it altered the device +- * list and 0 if it didn't; both return values indicate success. +- * Only a negative return value indicates an error. 
+- */ +- ret = bpf_list_add_device(&conf->bpf_devices, &device_item); +- if (ret < 0) +- return -1; +- +- return 0; +-} +-__cgfsng_ops static bool isulad_cgfsng_setup_limits(struct cgroup_ops *ops, +- struct lxc_handler *handler) +-{ +- __do_free char *path = NULL; +- struct list_head *cgroup_settings; +- struct hierarchy *h; +- struct lxc_conf *conf; +- struct lxc_cgroup *cg; +- +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +- if (!ops->hierarchies) +- return true; +- +- if (!ops->container_cgroup) +- return ret_set_errno(false, EINVAL); +- +- if (!handler || !handler->conf) +- return ret_set_errno(false, EINVAL); +- conf = handler->conf; +- +- if (list_empty(&conf->cgroup2)) +- return true; +- cgroup_settings = &conf->cgroup2; +- +- if (!pure_unified_layout(ops)) +- return true; +- +- if (!ops->unified) +- return false; +- h = ops->unified; +- +- list_for_each_entry(cg, cgroup_settings, head) { +- int ret; +- +- if (strncmp("devices", cg->subsystem, 7) == 0) { +- ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, +- cg->value); +- } else if (strcmp(cg->subsystem, "files.limit") == 0) { +- long long int setvalue = 0; +- const char *cgvalue = cg->value; +- +- if (lxc_safe_long_long(cgvalue, &setvalue) != 0) +- return log_error(false, "Invalid integer value %s", cgvalue); +- +- if (setvalue <= 0) +- cgvalue = "max"; +- +- ret = lxc_write_openat(h->path_con, +- cg->subsystem, cgvalue, +- strlen(cgvalue)); +- if (ret < 0) +- return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", +- cg->subsystem, cgvalue); +- } else { +- if (strcmp(cg->subsystem, "io.weight") == 0 || strcmp(cg->subsystem, "io.bfq.weight") == 0) { +- path = must_make_path(h->path_con, cg->subsystem, NULL); +- if (!file_exists(path)) { +- continue; +- } +- } +- ret = lxc_write_openat(h->path_con, +- cg->subsystem, cg->value, +- strlen(cg->value)); +- if (ret < 0) +- return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", +- cg->subsystem, cg->value); +- } 
+- TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value); +- } +- +- return log_info(true, "Limits for the unified cgroup hierarchy have been setup"); +-} +- +-__cgfsng_ops bool isulad_cgfsng_devices_activate(struct cgroup_ops *ops, +- struct lxc_handler *handler) +-{ +-#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX +- __do_bpf_program_free struct bpf_program *devices = NULL; +- int ret; +- struct lxc_conf *conf; +- struct hierarchy *unified; +- struct lxc_list *it; +- struct bpf_program *devices_old; +- +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +- if (!ops->hierarchies) +- return true; +- +- if (!ops->container_cgroup) +- return ret_set_errno(false, EEXIST); +- +- if (!handler || !handler->conf) +- return ret_set_errno(false, EINVAL); +- conf = handler->conf; +- +- unified = ops->unified; +- if (!unified || !unified->bpf_device_controller || +- !unified->path_con || lxc_list_empty(&conf->devices)) +- return true; +- +- devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE); +- if (!devices) +- return log_error_errno(false, ENOMEM, "Failed to create new bpf program"); +- +- ret = bpf_program_init(devices); +- if (ret) +- return log_error_errno(false, ENOMEM, "Failed to initialize bpf program"); +- +- lxc_list_for_each(it, &conf->devices) { +- struct device_item *cur = it->elem; +- +- ret = bpf_program_append_device(devices, cur); +- if (ret) +- return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d", +- cur->type, +- cur->major, +- cur->minor, +- cur->access, +- cur->allow, +- cur->global_rule); +- TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d", +- cur->type, +- cur->major, +- cur->minor, +- cur->access, +- cur->allow, +- cur->global_rule); +- } +- +- ret = bpf_program_finalize(devices); +- if (ret) +- return log_error_errno(false, ENOMEM, "Failed to finalize bpf program"); +- +- ret = 
bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE, +- unified->path_con, +- BPF_F_ALLOW_MULTI); +- if (ret) +- return log_error_errno(false, ENOMEM, "Failed to attach bpf program"); +- +- /* Replace old bpf program. */ +- devices_old = move_ptr(conf->cgroup2_devices); +- conf->cgroup2_devices = move_ptr(devices); +- devices = move_ptr(devices_old); +-#endif +- return true; +-} +- +-bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup) +-{ +- __do_free char *add_controllers = NULL, *base_path = NULL; +- __do_free_string_list char **parts = NULL; +- struct hierarchy *unified = ops->unified; +- ssize_t parts_len; +- char **it; +- size_t full_len = 0; +- +- if (!ops->hierarchies || !pure_unified_layout(ops) || +- !unified->controllers[0]) +- return true; +- +- /* For now we simply enable all controllers that we have detected by +- * creating a string like "+memory +pids +cpu +io". +- * TODO: In the near future we might want to support "-" +- * etc. but whether supporting semantics like this make sense will need +- * some thinking. 
+- */ +- for (it = unified->controllers; it && *it; it++) { +- full_len += strlen(*it) + 2; +- add_controllers = must_realloc(add_controllers, full_len + 1); +- +- if (unified->controllers[0] == *it) +- add_controllers[0] = '\0'; +- +- (void)strlcat(add_controllers, "+", full_len + 1); +- (void)strlcat(add_controllers, *it, full_len + 1); +- +- if (*(it + 1)) +- (void)strlcat(add_controllers, " ", full_len + 1); +- } +- +- parts = lxc_string_split(cgroup, '/'); +- if (!parts) +- return false; +- +- parts_len = lxc_array_len((void **)parts); +- if (parts_len > 0) +- parts_len--; +- +- base_path = must_make_path(unified->at_mnt, unified->at_base, NULL); +- for (ssize_t i = -1; i < parts_len; i++) { +- int ret; +- __do_free char *target = NULL; +- +- if (i >= 0) +- base_path = must_append_path(base_path, parts[i], NULL); +- target = must_make_path(base_path, "cgroup.subtree_control", NULL); +- ret = lxc_writeat(-1, target, add_controllers, full_len); +- if (ret < 0) +- return log_error_errno(false, errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"", +- add_controllers, target); +- TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target); +- } +- +- return true; +-} +- +-__cgfsng_ops bool isulad_cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops) +-{ +- return true; +-} +- +-__cgfsng_ops bool isulad_cgfsng_payload_delegate_controllers(struct cgroup_ops *ops) +-{ +- if (!ops) +- return ret_set_errno(false, ENOENT); +- +-#ifdef HAVE_ISULAD +- if (ops->no_controller) { +- DEBUG("no controller found, isgnore isulad_cgfsng_payload_delegate_controllers"); +- return true; +- } +-#endif +- +- return __cgfsng_delegate_controllers(ops, ops->container_cgroup); +-} +- +-static inline bool unified_cgroup(const char *line) +-{ +- return *line == '0'; +-} +- +-static inline char *current_unified_cgroup(bool relative, char *line) +-{ +- char *current_cgroup; +- +- line += STRLITERALLEN("0::"); +- +- if (!abspath(line)) +- 
return ERR_PTR(-EINVAL); +- +- /* remove init.scope */ +- if (!relative) +- line = prune_init_scope(line); +- +- /* create a relative path */ +- line = deabs(line); +- +- current_cgroup = strdup(line); +- if (!current_cgroup) +- return ERR_PTR(-ENOMEM); +- +- return current_cgroup; +-} +- +-static inline const char *unprefix(const char *controllers) +-{ +- if (strnequal(controllers, "name=", STRLITERALLEN("name="))) +- return controllers + STRLITERALLEN("name="); +- return controllers; +-} +- +-static int __list_cgroup_delegate(char ***delegate) +-{ +- __do_free char **list = NULL; +- __do_free char *buf = NULL; +- char *standard[] = { +- "cgroup.procs", +- "cgroup.threads", +- "cgroup.subtree_control", +- "memory.oom.group", +- NULL, +- }; +- char *token; +- int ret; +- +- buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0); +- if (!buf) { +- for (char **p = standard; p && *p; p++) { +- ret = list_add_string(&list, *p); +- if (ret < 0) +- return ret; +- } +- +- *delegate = move_ptr(list); +- return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate"); +- } +- +- lxc_iterate_parts(token, buf, " \t\n") { +- /* +- * We always need to chown this for both cgroup and +- * cgroup2. 
+- */ +- if (strequal(token, "cgroup.procs")) +- continue; +- +- ret = list_add_string(&list, token); +- if (ret < 0) +- return ret; +- } +- +- *delegate = move_ptr(list); +- return 0; +-} +- +-static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files) +-{ +- __do_free_string_list char **list = NULL; +- int ret; +- +- ret = __list_cgroup_delegate(&list); +- if (ret < 0) +- return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements"); +- +- for (char *const *s = list; s && *s; s++) { +- if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT) +- continue; +- +- return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s); +- } +- +- *ret_files = move_ptr(list); +- return true; +-} +- +-static bool legacy_hierarchy_delegated(int dfd_base) +-{ +- int ret; +- +- ret = faccessat(dfd_base, ".", W_OK, 0); +- if (ret < 0 && errno != ENOENT) +- return sysinfo_ret(false, "Legacy hierarchy not writable, skipping"); +- +- return true; +-} +- +-/** +- * systemd guarantees that the order of co-mounted controllers is stable. On +- * some systems the order of the controllers might be reversed though. 
+- * +- * For example, this is how the order is mismatched on CentOS 7: +- * +- * [root@localhost ~]# cat /proc/self/cgroup +- * 11:perf_event:/ +- * 10:pids:/ +- * 9:freezer:/ +- * >>>> 8:cpuacct,cpu:/ +- * 7:memory:/ +- * 6:blkio:/ +- * 5:devices:/ +- * 4:hugetlb:/ +- * >>>> 3:net_prio,net_cls:/ +- * 2:cpuset:/ +- * 1:name=systemd:/user.slice/user-0.slice/session-c1.scope +- * +- * whereas the mountpoint: +- * +- * | |-/sys/fs/cgroup tmpfs tmpfs ro,nosuid,nodev,noexec,mode=755 +- * | | |-/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +- * | | |-/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset +- * >>>> | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls +- * | | |-/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb +- * | | |-/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices +- * | | |-/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio +- * | | |-/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory +- * >>>> | | |-/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu +- * | | |-/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer +- * | | |-/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids +- * | | `-/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event +- * +- * Ensure that we always use the systemd-guaranteed stable order when checking +- * for the mountpoint. 
+- */ +-#if HAVE_COMPILER_ATTR_NONNULL +-__attribute__((nonnull)) +-#endif +-#if HAVE_COMPILER_ATTR_RETURNS_NONNULL +-__attribute__((returns_nonnull)) +-#endif +-static const char *stable_order(const char *controllers) +-{ +- if (strequal(controllers, "cpuacct,cpu")) +- return "cpu,cpuacct"; +- +- if (strequal(controllers, "net_prio,net_cls")) +- return "net_cls,net_prio"; +- +- return unprefix(controllers); +-} +- +-#define CGFSNG_LAYOUT_LEGACY BIT(0) +-#define CGFSNG_LAYOUT_UNIFIED BIT(1) +- +-static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, +- bool unprivileged, struct lxc_conf *conf) +-{ +- __do_free char *cgroup_info = NULL; +- unsigned int layout_mask = 0; +- int ret; +- char *it; +- +- ret = unpriv_systemd_create_scope(ops, conf); +- if (ret < 0) +- return ret_set_errno(false, ret); +- else if (ret == 0) +- TRACE("Entered an unpriv systemd scope"); +- +- /* +- * Root spawned containers escape the current cgroup, so use init's +- * cgroups as our base in that case. +- */ +- if (!relative && (geteuid() == 0)) +- cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0); +- else +- cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); +- if (!cgroup_info) +- return ret_errno(ENOMEM); +- +- lxc_iterate_parts(it, cgroup_info, "\n") { +- __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF; +- __do_free char *controllers = NULL, *current_cgroup = NULL; +- __do_free_string_list char **controller_list = NULL, +- **delegate = NULL; +- char *line; +- int dfd, type; +- +- /* Handle the unified cgroup hierarchy. 
*/ +- line = it; +- if (unified_cgroup(line)) { +- char *unified_mnt; +- +- type = UNIFIED_HIERARCHY; +- layout_mask |= CGFSNG_LAYOUT_UNIFIED; +- +- if (conf->cgroup_meta.systemd_scope) +- current_cgroup = cgroup_relpath(conf->cgroup_meta.systemd_scope); +- if (IS_ERR_OR_NULL(current_cgroup)) +- current_cgroup = current_unified_cgroup(relative, line); +- if (IS_ERR(current_cgroup)) +- return PTR_ERR(current_cgroup); +- +- if (unified_cgroup_fd(ops->dfd_mnt)) { +- dfd_mnt = dup_cloexec(ops->dfd_mnt); +- unified_mnt = ""; +- } else { +- dfd_mnt = open_at(ops->dfd_mnt, +- "unified", +- PROTECT_OPATH_DIRECTORY, +- PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); +- unified_mnt = "unified"; +- } +- if (dfd_mnt < 0) { +- if (errno != ENOENT) +- return syserror("Failed to open %d/unified", ops->dfd_mnt); +- +- SYSTRACE("Unified cgroup not mounted"); +- continue; +- } +- +- if (!fhas_fs_type(dfd_mnt, CGROUP2_SUPER_MAGIC)) { +- SYSTRACE("Opened file descriptor %d is not a cgroup2 mountpoint", dfd_mnt); +- continue; +- } +- +- dfd = dfd_mnt; +- +- if (!is_empty_string(current_cgroup)) { +- dfd_base = open_at(dfd_mnt, current_cgroup, +- PROTECT_OPATH_DIRECTORY, +- PROTECT_LOOKUP_BENEATH_XDEV, 0); +- if (dfd_base < 0) { +- if (errno != ENOENT) +- return syserror("Failed to open %d/%s", +- dfd_mnt, current_cgroup); +- +- SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)", +- dfd_mnt, current_cgroup); +- continue; +- } +- dfd = dfd_base; +- } +- +- if (!unified_hierarchy_delegated(dfd, &delegate)) +- continue; +- +- controller_list = unified_controllers(dfd, "cgroup.controllers"); +- if (!controller_list) { +- TRACE("No controllers are enabled for delegation in the unified hierarchy"); +- controller_list = list_new(); +- if (!controller_list) +- return syserror_set(-ENOMEM, "Failed to create empty controller list"); +- } +- +- controllers = strdup(unified_mnt); +- if (!controllers) +- return ret_errno(ENOMEM); +- } else { +- char *__controllers, *__current_cgroup; +- +- type 
= LEGACY_HIERARCHY; +- layout_mask |= CGFSNG_LAYOUT_LEGACY; +- +- __controllers = strchr(line, ':'); +- if (!__controllers) +- return ret_errno(EINVAL); +- __controllers++; +- +- __current_cgroup = strchr(__controllers, ':'); +- if (!__current_cgroup) +- return ret_errno(EINVAL); +- *__current_cgroup = '\0'; +- __current_cgroup++; +- +- controllers = strdup(stable_order(__controllers)); +- if (!controllers) +- return ret_errno(ENOMEM); +- +- dfd_mnt = open_at(ops->dfd_mnt, +- controllers, +- PROTECT_OPATH_DIRECTORY, +- PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); +- if (dfd_mnt < 0) { +- if (errno != ENOENT) +- return syserror("Failed to open %d/%s", +- ops->dfd_mnt, controllers); +- +- SYSTRACE("%s not mounted", controllers); +- continue; +- } +- +- if (!fhas_fs_type(dfd_mnt, CGROUP_SUPER_MAGIC)) { +- SYSTRACE("Opened file descriptor %d is not a cgroup mountpoint", dfd_mnt); +- continue; +- } +- +- dfd = dfd_mnt; +- +- if (!abspath(__current_cgroup)) +- return ret_errno(EINVAL); +- +- /* remove init.scope */ +- if (!relative) +- __current_cgroup = prune_init_scope(__current_cgroup); +- +- /* create a relative path */ +- __current_cgroup = deabs(__current_cgroup); +- +- current_cgroup = strdup(__current_cgroup); +- if (!current_cgroup) +- return ret_errno(ENOMEM); +- +- if (!is_empty_string(current_cgroup)) { +- dfd_base = open_at(dfd_mnt, current_cgroup, +- PROTECT_OPATH_DIRECTORY, +- PROTECT_LOOKUP_BENEATH_XDEV, 0); +- if (dfd_base < 0) { +- if (errno != ENOENT) +- return syserror("Failed to open %d/%s", +- dfd_mnt, current_cgroup); +- +- SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)", +- dfd_mnt, current_cgroup); +- continue; +- } +- dfd = dfd_base; +- } +- +- if (!legacy_hierarchy_delegated(dfd)) +- continue; +- +- /* +- * We intentionally pass __current_cgroup here and not +- * controllers because we would otherwise chop the +- * mountpoint. 
+- */ +- controller_list = list_add_controllers(__controllers); +- if (!controller_list) +- return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers); +- +- if (skip_hierarchy(ops, controller_list)) +- continue; +- +- ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; +- } +- +- ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd, +- current_cgroup, controller_list, type); +- if (ret < 0) +- return syserror_ret(ret, "Failed to add %s hierarchy", controllers); +- +- /* Transfer ownership. */ +- move_fd(dfd_mnt); +- move_fd(dfd_base); +- move_ptr(current_cgroup); +- move_ptr(controllers); +- move_ptr(controller_list); +- if (type == UNIFIED_HIERARCHY) +- ops->unified->delegate = move_ptr(delegate); +- } +- +- /* determine cgroup layout */ +- if (ops->unified) { +- if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) { +- ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; +- } else { +- if (bpf_devices_cgroup_supported()) +- ops->unified->utilities |= DEVICES_CONTROLLER; +- ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; +- } +- } +- +- /* +- * If we still don't know the cgroup layout at this point it means we +- * have not found any writable cgroup hierarchies. Infer the layout +- * from the layout bitmask we created when parsing the cgroups. +- * +- * Keep the ordering in the switch otherwise the bistmask-based +- * matching won't work. 
+- */ +- if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) { +- switch (layout_mask) { +- case (CGFSNG_LAYOUT_LEGACY | CGFSNG_LAYOUT_UNIFIED): +- ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; +- break; +- case CGFSNG_LAYOUT_LEGACY: +- ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; +- break; +- case CGFSNG_LAYOUT_UNIFIED: +- ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; +- break; +- } +- } +- +- if (!controllers_available(ops)) +- return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated"); +- +- return 0; +-} +- +-static int isulad_initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf) +-{ +- __do_close int dfd = -EBADF; +- int ret; +- const char *controllers_use; +- +- if (ops->dfd_mnt >= 0) +- return ret_errno(EBUSY); +- +- /* +- * I don't see the need for allowing symlinks here. If users want to +- * have their hierarchy available in different locations I strongly +- * suggest bind-mounts. +- */ +- dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT, +- PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); +- if (dfd < 0) +- return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT); +- +- controllers_use = lxc_global_config_value("lxc.cgroup.use"); +- if (controllers_use) { +- __do_free char *dup = NULL; +- char *it; +- +- dup = strdup(controllers_use); +- if (!dup) +- return -errno; +- +- lxc_iterate_parts(it, dup, ",") { +- ret = list_add_string(&ops->cgroup_use, it); +- if (ret < 0) +- return ret; +- } +- } +- +- /* +- * Keep dfd referenced by the cleanup function and actually move the fd +- * once we know the initialization succeeded. So if we fail we clean up +- * the dfd. +- */ +- ops->dfd_mnt = dfd; +- +- ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map), conf); +- if (ret < 0) +- return syserror_ret(ret, "Failed to initialize cgroups"); +- +- /* Transfer ownership to cgroup_ops. 
*/ +- move_fd(dfd); +- return 0; +-} +- +-__cgfsng_ops static int isulad_cgfsng_data_init(struct cgroup_ops *ops, struct lxc_conf *conf) +-{ +- const char *cgroup_pattern; +-#ifdef HAVE_ISULAD +- const char *cgroup_tree; +- __do_free char *container_cgroup = NULL, *__cgroup_tree = NULL; +- size_t len; +-#endif +- +- if (!ops) +- return ret_set_errno(-1, ENOENT); +- +- /* copy system-wide cgroup information */ +- cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern"); +- if (cgroup_pattern && !strequal(cgroup_pattern, "")) { +- ops->cgroup_pattern = strdup(cgroup_pattern); +- if (!ops->cgroup_pattern) +- return ret_errno(ENOMEM); +- } +- +-#ifdef HAVE_ISULAD +- if (conf->cgroup_meta.dir) { +- cgroup_tree = conf->cgroup_meta.dir; +- container_cgroup = must_concat(&len, cgroup_tree, "/", conf->name, NULL); +- } else if (ops->cgroup_pattern) { +- __cgroup_tree = lxc_string_replace("%n", conf->name, ops->cgroup_pattern); +- if (!__cgroup_tree) +- return ret_set_errno(-1, ENOMEM); +- +- cgroup_tree = __cgroup_tree; +- container_cgroup = must_concat(&len, cgroup_tree, NULL); +- } else { +- cgroup_tree = NULL; +- container_cgroup = must_concat(&len, conf->name, NULL); +- } +- if (!container_cgroup) +- return ret_set_errno(-1, ENOMEM); +- +- ops->container_cgroup = move_ptr(container_cgroup); +-#endif +- +- return 0; +-} +- +-struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) +-{ +- __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL; +- +- cgfsng_ops = zalloc(sizeof(struct cgroup_ops)); +- if (!cgfsng_ops) +- return ret_set_errno(NULL, ENOMEM); +- +- cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN; +- cgfsng_ops->dfd_mnt = -EBADF; +- +- if (isulad_initialize_cgroups(cgfsng_ops, conf)) +- return NULL; +- +- cgfsng_ops->data_init = isulad_cgfsng_data_init; +- +- cgfsng_ops->errfd = conf ? 
conf->errpipe[1] : -1; +- cgfsng_ops->get_cgroup_full_path = isulad_cgfsng_get_cgroup_full_path; +- cgfsng_ops->payload_destroy = isulad_cgfsng_payload_destroy; +- cgfsng_ops->monitor_destroy = isulad_cgfsng_monitor_destroy; +- cgfsng_ops->monitor_create = isulad_cgfsng_monitor_create; +- cgfsng_ops->monitor_enter = isulad_cgfsng_monitor_enter; +- cgfsng_ops->monitor_delegate_controllers = isulad_cgfsng_monitor_delegate_controllers; +- cgfsng_ops->payload_delegate_controllers = isulad_cgfsng_payload_delegate_controllers; +- cgfsng_ops->payload_create = isulad_cgfsng_payload_create; +- cgfsng_ops->payload_enter = isulad_cgfsng_payload_enter; +- cgfsng_ops->finalize = isulad_cgfsng_finalize; +- cgfsng_ops->get_cgroup = isulad_cgfsng_get_cgroup; +- cgfsng_ops->get = isulad_cgfsng_get; +- cgfsng_ops->set = isulad_cgfsng_set; +- cgfsng_ops->freeze = isulad_cgfsng_freeze; +- cgfsng_ops->unfreeze = isulad_cgfsng_unfreeze; +- cgfsng_ops->setup_limits_legacy = isulad_cgfsng_setup_limits_legacy; +- cgfsng_ops->setup_limits = isulad_cgfsng_setup_limits; +- cgfsng_ops->driver = "isulad_cgfsng"; +- cgfsng_ops->version = "1.0.0"; +- cgfsng_ops->attach = isulad_cgfsng_attach; +- cgfsng_ops->chown = isulad_cgfsng_chown; +- cgfsng_ops->mount = isulad_cgfsng_mount; +- cgfsng_ops->devices_activate = isulad_cgfsng_devices_activate; +- +- cgfsng_ops->criu_escape = isulad_cgfsng_criu_escape; +- cgfsng_ops->criu_num_hierarchies = isulad_cgfsng_criu_num_hierarchies; +- cgfsng_ops->criu_get_hierarchies = isulad_cgfsng_criu_get_hierarchies; +- +- return move_ptr(cgfsng_ops); +-} +- +-static int __unified_attach_fd(const struct lxc_conf *conf, const char *lxcpath, int fd_unified, pid_t pid) +-{ +- int ret; +- +- if (!list_empty(&conf->id_map)) { +- struct userns_exec_unified_attach_data args = { +- .conf = conf, +- .unified_fd = fd_unified, +- .pid = pid, +- .unprivileged = am_guest_unpriv(), +- .lxcpath = lxcpath, +- }; +- +- ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, 
args.sk_pair); +- if (ret < 0) +- return -errno; +- +- ret = userns_exec_minimal(conf, +- cgroup_unified_attach_parent_wrapper, +- &args, +- cgroup_unified_attach_child_wrapper, +- &args); +- } else { +- ret = cgroup_attach_leaf(conf, fd_unified, pid); +- } +- +- return ret; +-} +- +-static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name, +- const char *lxcpath, pid_t pid) +-{ +- call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){}; +- int ret; +- size_t idx; +- ssize_t pidstr_len; +- char pidstr[INTTYPE_TO_STRLEN(pid_t)]; +- +- ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx); +- if (ret < 0) +- return ret_errno(ENOSYS); +- +- if (ctx->fd_len == 0) +- return log_trace(0, "Container runs with unwritable %s cgroup layout", +- cgroup_layout_name(ctx->layout)); +- +- pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid); +- if (pidstr_len < 0) +- return pidstr_len; +- +- for (idx = 0; idx < ctx->fd_len; idx++) { +- int dfd_con = ctx->fd[idx]; +- +- if (unified_cgroup_fd(dfd_con)) +- ret = __unified_attach_fd(conf, lxcpath, dfd_con, pid); +- else +- ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len); +- if (ret) +- return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con); +- else +- TRACE("Attached to cgroup fd %d", dfd_con); +- } +- +- TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout)); +- return 0; +-} +- +-static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name, +- const char *lxcpath, pid_t pid) +-{ +- __do_close int dfd_unified = -EBADF; +- +- if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0) +- return ret_errno(EINVAL); +- +- dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath); +- if (dfd_unified < 0) +- return ret_errno(ENOSYS); +- +- return __unified_attach_fd(conf, lxcpath, dfd_unified, pid); +-} +- +-int cgroup_attach(const struct lxc_conf *conf, const char *name, +- const char *lxcpath, 
pid_t pid) +-{ +- int ret; +- +- ret = __cgroup_attach_many(conf, name, lxcpath, pid); +- if (ret < 0) { +- if (!ERRNO_IS_NOT_SUPPORTED(ret)) +- return ret; +- +- ret = __cgroup_attach_unified(conf, name, lxcpath, pid); +- if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret)) +- return ret_errno(ENOSYS); +- } +- +- return ret; +-} +- +-/* Connects to command socket therefore isn't callable from command handler. */ +-int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len) +-{ +- __do_close int dfd = -EBADF; +- struct cgroup_fd fd = { +- .fd = -EBADF, +- }; +- size_t len_controller; +- int ret; +- +- if (is_empty_string(name) || is_empty_string(lxcpath) || +- is_empty_string(key)) +- return ret_errno(EINVAL); +- +- if ((buf && !len) || (len && !buf)) +- return ret_errno(EINVAL); +- +- len_controller = strcspn(key, "."); +- len_controller++; /* Don't forget the \0 byte. */ +- if (len_controller >= MAX_CGROUP_ROOT_NAMELEN) +- return ret_errno(EINVAL); +- (void)strlcpy(fd.controller, key, len_controller); +- +- ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd); +- if (ret < 0) { +- if (!ERRNO_IS_NOT_SUPPORTED(ret)) +- return ret; +- +- dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); +- if (dfd < 0) { +- if (!ERRNO_IS_NOT_SUPPORTED(ret)) +- return ret; +- +- return ret_errno(ENOSYS); +- } +- fd.type = UNIFIED_HIERARCHY; +- fd.fd = move_fd(dfd); +- } +- dfd = move_fd(fd.fd); +- +- TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type)); +- +- if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) +- return ret_errno(EOPNOTSUPP); +- else +- ret = lxc_read_try_buf_at(dfd, key, buf, len); +- +- return ret; +-} +- +-/* Connects to command socket therefore isn't callable from command handler. 
*/ +-int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value) +-{ +- __do_close int dfd = -EBADF; +- struct cgroup_fd fd = { +- .fd = -EBADF, +- }; +- size_t len_controller; +- int ret; +- +- if (is_empty_string(name) || is_empty_string(lxcpath) || +- is_empty_string(key) || is_empty_string(value)) +- return ret_errno(EINVAL); +- +- len_controller = strcspn(key, "."); +- len_controller++; /* Don't forget the \0 byte. */ +- if (len_controller >= MAX_CGROUP_ROOT_NAMELEN) +- return ret_errno(EINVAL); +- (void)strlcpy(fd.controller, key, len_controller); +- +- ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd); +- if (ret < 0) { +- if (!ERRNO_IS_NOT_SUPPORTED(ret)) +- return ret; +- +- dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); +- if (dfd < 0) { +- if (!ERRNO_IS_NOT_SUPPORTED(ret)) +- return ret; +- +- return ret_errno(ENOSYS); +- } +- fd.type = UNIFIED_HIERARCHY; +- fd.fd = move_fd(dfd); +- } +- dfd = move_fd(fd.fd); +- +- TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type)); +- +- if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) { +- struct device_item device = {}; +- +- ret = device_cgroup_rule_parse(&device, key, value); +- if (ret < 0) +- return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", +- key, value); +- +- ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device); +- } else { +- ret = lxc_writeat(dfd, key, value, strlen(value)); +- } +- +- return ret; +-} +- +-static int do_cgroup_freeze(int unified_fd, +- const char *state_string, +- int state_num, +- int timeout, +- const char *epoll_error, +- const char *wait_error) +-{ +- __do_close int events_fd = -EBADF; +- call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL; +- int ret; +- struct lxc_async_descr descr = {}; +- +- if (timeout != 0) { +- ret = lxc_mainloop_open(&descr); +- if (ret) +- return log_error_errno(-1, errno, "%s", 
epoll_error); +- +- /* automatically cleaned up now */ +- descr_ptr = &descr; +- +- events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0); +- if (events_fd < 0) +- return log_error_errno(-errno, errno, "Failed to open cgroup.events file"); +- +- ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, +- freezer_cgroup_events_cb, +- default_cleanup_handler, +- INT_TO_PTR(state_num), +- "freezer_cgroup_events_cb"); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); +- } +- +- ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1); +- if (ret < 0) +- return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); +- +- if (timeout != 0) { +- ret = lxc_mainloop(&descr, timeout); +- if (ret) +- return log_error_errno(-1, errno, "%s", wait_error); +- } +- +- return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen"); +-} +- +-static inline int __cgroup_freeze(int unified_fd, int timeout) +-{ +- return do_cgroup_freeze(unified_fd, "1", 1, timeout, +- "Failed to create epoll instance to wait for container freeze", +- "Failed to wait for container to be frozen"); +-} +- +-int cgroup_freeze(const char *name, const char *lxcpath, int timeout) +-{ +- __do_close int unified_fd = -EBADF; +- int ret; +- +- if (is_empty_string(name) || is_empty_string(lxcpath)) +- return ret_errno(EINVAL); +- +- unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); +- if (unified_fd < 0) +- return ret_errno(ENOCGROUP2); +- +- lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING); +- ret = __cgroup_freeze(unified_fd, timeout); +- lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? 
FROZEN : RUNNING); +- return ret; +-} +- +-int __cgroup_unfreeze(int unified_fd, int timeout) +-{ +- return do_cgroup_freeze(unified_fd, "0", 0, timeout, +- "Failed to create epoll instance to wait for container freeze", +- "Failed to wait for container to be frozen"); +-} +- +-int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout) +-{ +- __do_close int unified_fd = -EBADF; +- int ret; +- +- if (is_empty_string(name) || is_empty_string(lxcpath)) +- return ret_errno(EINVAL); +- +- unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); +- if (unified_fd < 0) +- return ret_errno(ENOCGROUP2); +- +- lxc_cmd_notify_state_listeners(name, lxcpath, THAWED); +- ret = __cgroup_unfreeze(unified_fd, timeout); +- lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN); +- return ret; +-} +-- +2.25.1 + diff --git a/0007-fix-run-container-failed-when-enable-isulad.patch b/0007-fix-run-container-failed-when-enable-isulad.patch new file mode 100644 index 0000000..7484ab8 --- /dev/null +++ b/0007-fix-run-container-failed-when-enable-isulad.patch @@ -0,0 +1,1060 @@ +From d743d299c37b71d0990d0a68ad492a0ec76a7886 Mon Sep 17 00:00:00 2001 +From: zhangxiaoyu +Date: Wed, 18 Oct 2023 11:01:49 +0800 +Subject: [PATCH] fix run container failed when enable isulad + +Signed-off-by: zhangxiaoyu +--- + src/lxc/attach.c | 11 +- + src/lxc/cgroups/cgfsng.c | 581 +++++++++++++++++++++++++++++++++++++++ + src/lxc/cgroups/cgroup.h | 4 + + src/lxc/conf.c | 17 +- + src/lxc/confile.c | 34 ++- + src/lxc/meson.build | 4 +- + src/lxc/start.c | 11 +- + 7 files changed, 628 insertions(+), 34 deletions(-) + +diff --git a/src/lxc/attach.c b/src/lxc/attach.c +index 066eb5c..ae12da3 100644 +--- a/src/lxc/attach.c ++++ b/src/lxc/attach.c +@@ -1454,18 +1454,17 @@ __noreturn static void do_attach(struct attach_payload *ap) + goto on_error; + } + } else { +-#else +- ret = lxc_terminal_prepare_login(ap->terminal_pts_fd); +-#endif +-#ifdef HAVE_ISULAD +- } + #endif ++ ret = 
lxc_terminal_prepare_login(ap->terminal_pts_fd); + if (ret < 0) { + SYSERROR("Failed to prepare terminal file descriptor %d", ap->terminal_pts_fd); + goto on_error; + } + + TRACE("Prepared terminal file descriptor %d", ap->terminal_pts_fd); ++#ifdef HAVE_ISULAD ++ } ++#endif + } + + /* Avoid unnecessary syscalls. */ +@@ -2264,7 +2263,7 @@ int lxc_attach(struct lxc_container *container, lxc_attach_exec_t exec_function, + + if (options->attach_flags & LXC_ATTACH_TERMINAL) { + #ifdef HAVE_ISULAD +- ret = isulad_safe_mainloop(&descr, -1); ++ ret = isulad_safe_mainloop(&descr, -1); + #else + ret = lxc_mainloop(&descr, -1); + #endif +diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c +index 4e4ae0c..0aaafa8 100644 +--- a/src/lxc/cgroups/cgfsng.c ++++ b/src/lxc/cgroups/cgfsng.c +@@ -528,32 +528,64 @@ static int cgroup_tree_remove_wrapper(void *data) + return cgroup_tree_remove(arg->hierarchies, arg->path_prune); + } + ++#ifdef HAVE_ISULAD ++__cgfsng_ops static bool cgfsng_payload_destroy(struct cgroup_ops *ops, ++ struct lxc_handler *handler) ++#else + __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops, + struct lxc_handler *handler) ++#endif + { + int ret; + + if (!ops) { + ERROR("Called with uninitialized cgroup operations"); ++#ifdef HAVE_ISULAD ++ return false; ++#else + return; ++#endif ++ } ++ ++#ifdef HAVE_ISULAD ++ if (ops->no_controller) { ++ DEBUG("no controller found, ignore isulad_cgfsng_payload_destroy"); ++ return true; + } ++#endif + + if (!ops->hierarchies) ++#ifdef HAVE_ISULAD ++ return true; ++#else + return; ++#endif + + if (!handler) { + ERROR("Called with uninitialized handler"); ++#ifdef HAVE_ISULAD ++ return false; ++#else + return; ++#endif + } + + if (!handler->conf) { + ERROR("Called with uninitialized conf"); ++#ifdef HAVE_ISULAD ++ return false; ++#else + return; ++#endif + } + + if (!ops->container_limit_cgroup) { + WARN("Uninitialized limit cgroup"); ++#ifdef HAVE_ISULAD ++ return true; ++#else + return; 
++#endif + } + + ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices); +@@ -579,6 +611,9 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops, + } + if (ret < 0) + SYSWARN("Failed to destroy cgroups"); ++#ifdef HAVE_ISULAD ++ return ret >= 0; ++#endif + } + + #define __ISOL_CPUS "/sys/devices/system/cpu/isolated" +@@ -854,6 +889,10 @@ static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune, + __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops, + struct lxc_handler *handler) + { ++#ifdef HAVE_ISULAD ++ // ignore destroy monitor cgroup ++ return; ++#endif + int len; + char pidstr[INTTYPE_TO_STRLEN(pid_t)]; + const struct lxc_conf *conf; +@@ -1311,6 +1350,10 @@ static char *cgroup_relpath(char *cg) + + __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler) + { ++#ifdef HAVE_ISULAD ++ // skip create monitor cgroup ++ return true; ++#endif + __do_free char *monitor_cgroup = NULL; + int idx = 0; + int i; +@@ -1411,6 +1454,17 @@ __cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lx + if (!ops->hierarchies) + return true; + ++#ifdef HAVE_ISULAD ++ if (ops->no_controller) { ++ DEBUG("no controller found, isgnore isulad_cgfsng_payload_create"); ++ return true; ++ } ++ if (ops->container_cgroup) { ++ free(ops->container_cgroup); ++ ops->container_cgroup = NULL; ++ } ++#endif ++ + if (ops->container_cgroup || ops->container_limit_cgroup) + return ret_set_errno(false, EEXIST); + +@@ -1504,6 +1558,10 @@ __cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lx + __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, + struct lxc_handler *handler) + { ++#ifdef HAVE_ISULAD ++ // ignore enter monitor cgroup ++ return true; ++#endif + int monitor_len, transient_len = 0; + char monitor[INTTYPE_TO_STRLEN(pid_t)], + transient[INTTYPE_TO_STRLEN(pid_t)]; +@@ -1577,6 +1635,13 @@ __cgfsng_ops static 
bool cgfsng_payload_enter(struct cgroup_ops *ops, + if (!ops) + return ret_set_errno(false, ENOENT); + ++#ifdef HAVE_ISULAD ++ if (ops->no_controller) { ++ DEBUG("no controller found, isgnore isulad_cgfsng_payload_enter"); ++ return true; ++ } ++#endif ++ + if (!ops->hierarchies) + return true; + +@@ -1728,6 +1793,13 @@ __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops) + if (!ops) + return; + ++#ifdef HAVE_ISULAD ++ if (ops->no_controller) { ++ DEBUG("no controller found, isgnore isulad_cgfsng_payload_finalize"); ++ return; ++ } ++#endif ++ + if (!ops->hierarchies) + return; + +@@ -1900,7 +1972,11 @@ static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, + return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified"); + + target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL); ++#ifdef HAVE_ISULAD ++ ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt, NULL); ++#else + ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt); ++#endif + } + if (ret < 0) + return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)", +@@ -1950,6 +2026,9 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, + struct lxc_rootfs *rootfs = &conf->rootfs; + const char *rootfs_mnt = get_rootfs_mnt(rootfs); + int ret; ++#ifdef HAVE_ISULAD ++ __do_free_string_list char **merged = NULL; ++#endif + + if (!ops) + return ret_set_errno(false, ENOENT); +@@ -2116,9 +2195,15 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, + MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME); + } else { + cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); ++#ifdef HAVE_ISULAD ++ ret = safe_mount(NULL, cgroup_root, "tmpfs", ++ MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, ++ "size=10240k,mode=755", rootfs_mnt, NULL); ++#else + ret = safe_mount(NULL, cgroup_root, "tmpfs", + MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, + 
"size=10240k,mode=755", rootfs_mnt); ++#endif + } + if (ret < 0) + return log_error_errno(false, errno, "Failed to mount tmpfs on %s", +@@ -2134,6 +2219,16 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, + __do_free char *hierarchy_mnt = NULL, *path2 = NULL; + struct hierarchy *h = ops->hierarchies[i]; + ++#ifdef HAVE_ISULAD ++ // isulad: symlink subcgroup ++ if (strchr(h->at_mnt, ',') != NULL) { ++ int pret; ++ pret = lxc_append_string(&merged, h->at_mnt); ++ if (pret < 0) ++ return false; ++ } ++#endif ++ + ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000); + if (ret < 0) + return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt); +@@ -2165,8 +2260,14 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, + cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); + + hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL); ++#ifdef HAVE_ISULAD ++ // isulad: ignore ops->container_cgroup so we will not see directory lxc after /sys/fs/cgroup/xxx in container, ++ // isulad: ignore h->container_base_path so we will not see subgroup of /sys/fs/cgroup/xxx/subgroup in container ++ path2 = must_make_path(h->at_mnt, NULL); ++#else + path2 = must_make_path(hierarchy_mnt, h->at_base, + ops->container_cgroup, NULL); ++#endif + ret = mkdir_p(path2, 0755); + if (ret < 0 && (errno != EEXIST)) + return false; +@@ -2178,6 +2279,64 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, + return false; + } + ++ ++#ifdef HAVE_ISULAD ++ // isulad: symlink subcgroup ++ // create symlink if no merged cgroup link ++ // like cpu -> cpu,cpuacct ++ if (merged) { ++ char **mc = NULL; ++ for (mc = merged; *mc; mc++) { ++ char *token = NULL; ++ __do_free char *copy = must_copy_string(*mc); ++ lxc_iterate_parts(token, copy, ",") { ++ int mret; ++ __do_free char *link = must_make_path(cgroup_root, token, NULL); ++ mret = symlink(*mc, link); ++ if (mret < 0 && errno != EEXIST) { ++ SYSERROR("Failed to 
create link %s for target %s", link, *mc); ++ return false; ++ } ++ } ++ } ++ } ++ ++ // isulad: remount /sys/fs/cgroup to readonly ++ if (cg_flags == LXC_AUTO_CGROUP_FULL_RO || cg_flags == LXC_AUTO_CGROUP_RO) { ++ ret = mount(cgroup_root, cgroup_root, "bind", ++ MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME|MS_RDONLY|MS_BIND|MS_REMOUNT, NULL); ++ if (ret < 0) { ++ SYSERROR("Failed to remount /sys/fs/cgroup."); ++ return false; ++ } ++ } ++ ++ // isulad: remount /sys/fs/cgroup/systemd to readwrite for system container ++ if (handler->conf->systemd != NULL && strcmp(handler->conf->systemd, "true") == 0) { ++ __do_free char *systemdpath = NULL; ++ __do_free char *unifiedpath = NULL; ++ unifiedpath = must_make_path(get_rootfs_mnt(rootfs), "/sys/fs/cgroup/unified", NULL); ++ if (dir_exists(unifiedpath)) ++ { ++ ret = umount2(unifiedpath, MNT_DETACH); ++ if (ret < 0) ++ { ++ SYSERROR("Failed to umount /sys/fs/cgroup/unified."); ++ return false; ++ } ++ } ++ ++ systemdpath = must_make_path(get_rootfs_mnt(rootfs), "/sys/fs/cgroup/systemd", NULL); ++ ret = mount(systemdpath, systemdpath, "bind", ++ MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME | MS_BIND | MS_REMOUNT, NULL); ++ if (ret < 0) ++ { ++ SYSERROR("Failed to remount /sys/fs/cgroup/systemd."); ++ return false; ++ } ++ } ++#endif ++ + return true; + } + +@@ -2724,6 +2883,13 @@ __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, + if (!ops) + return ret_set_errno(false, ENOENT); + ++#ifdef HAVE_ISULAD ++ if (ops->no_controller) { ++ DEBUG("no controller found, isgnore isulad_cgfsng_attach"); ++ return true; ++ } ++#endif ++ + if (!ops->hierarchies) + return true; + +@@ -3064,6 +3230,209 @@ static int convert_devpath(const char *invalue, char *dest) + return 0; + } + ++#ifdef HAVE_ISULAD ++#define BATCH_SIZE 50 ++static void batch_realloc(char **mem, size_t oldlen, size_t newlen) ++{ ++ int newbatches = (newlen / BATCH_SIZE) + 1; ++ int oldbatches = (oldlen / BATCH_SIZE) + 1; ++ ++ if (!*mem || newbatches > 
oldbatches) ++ *mem = must_realloc(*mem, newbatches * BATCH_SIZE); ++} ++ ++static void append_line(char **dest, size_t oldlen, char *new, size_t newlen) ++{ ++ size_t full = oldlen + newlen; ++ ++ batch_realloc(dest, oldlen, full + 1); ++ ++ memcpy(*dest + oldlen, new, newlen + 1); ++} ++ ++/* Slurp in a whole file */ ++static char *read_file(const char *fnam) ++{ ++ __do_free char *buf = NULL, *line = NULL; ++ __do_fclose FILE *f = NULL; ++ size_t len = 0, fulllen = 0; ++ int linelen; ++ ++ f = fopen(fnam, "re"); ++ if (!f) ++ return NULL; ++ ++ while ((linelen = getline(&line, &len, f)) != -1) { ++ append_line(&buf, fulllen, line, linelen); ++ fulllen += linelen; ++ } ++ ++ return move_ptr(buf); ++} ++ ++static bool isulad_copy_parent_file(char *path, char *file) ++{ ++ int ret; ++ int len = 0; ++ char *value = NULL; ++ char *current = NULL; ++ char *fpath = NULL; ++ char *lastslash = NULL; ++ char oldv; ++ ++ fpath = must_make_path(path, file, NULL); ++ current = read_file(fpath); ++ ++ if (current == NULL) { ++ SYSERROR("Failed to read file \"%s\"", fpath); ++ free(fpath); ++ return false; ++ } ++ ++ if (strcmp(current, "\n") != 0) { ++ free(fpath); ++ free(current); ++ return true; ++ } ++ ++ free(fpath); ++ free(current); ++ ++ lastslash = strrchr(path, '/'); ++ if (lastslash == NULL) { ++ ERROR("Failed to detect \"/\" in \"%s\"", path); ++ return false; ++ } ++ oldv = *lastslash; ++ *lastslash = '\0'; ++ fpath = must_make_path(path, file, NULL); ++ *lastslash = oldv; ++ len = lxc_read_from_file(fpath, NULL, 0); ++ if (len <= 0) ++ goto on_error; ++ ++ value = must_realloc(NULL, len + 1); ++ ret = lxc_read_from_file(fpath, value, len); ++ if (ret != len) ++ goto on_error; ++ free(fpath); ++ ++ fpath = must_make_path(path, file, NULL); ++ ret = lxc_write_to_file(fpath, value, len, false, 0666); ++ if (ret < 0) ++ SYSERROR("Failed to write \"%s\" to file \"%s\"", value, fpath); ++ free(fpath); ++ free(value); ++ return ret >= 0; ++ ++on_error: ++ 
SYSERROR("Failed to read file \"%s\"", fpath); ++ free(fpath); ++ free(value); ++ return false; ++} ++ ++static bool build_sub_cpuset_cgroup_dir(char *cgpath) ++{ ++ int ret; ++ ++ ret = mkdir_p(cgpath, 0755); ++ if (ret < 0) { ++ if (errno != EEXIST) { ++ SYSERROR("Failed to create directory \"%s\"", cgpath); ++ return false; ++ } ++ } ++ ++ /* copy parent's settings */ ++ if (!isulad_copy_parent_file(cgpath, "cpuset.cpus")) { ++ SYSERROR("Failed to copy \"cpuset.cpus\" settings"); ++ return false; ++ } ++ ++ /* copy parent's settings */ ++ if (!isulad_copy_parent_file(cgpath, "cpuset.mems")) { ++ SYSERROR("Failed to copy \"cpuset.mems\" settings"); ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool isulad_cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname) ++{ ++ char *cgpath, *slash; ++ bool sub_mk_success = false; ++ ++ if (is_unified_hierarchy(h)) ++ return true; ++ ++ if (!string_in_list(h->controllers, "cpuset")) ++ return true; ++ ++ cgname += strspn(cgname, "/"); ++ ++ slash = strchr(cgname, '/'); ++ ++ if (slash != NULL) { ++ while (slash) { ++ *slash = '\0'; ++ cgpath = must_make_path(h->at_mnt, h->at_base, cgname, NULL); ++ sub_mk_success = build_sub_cpuset_cgroup_dir(cgpath); ++ free(cgpath); ++ *slash = '/'; ++ if (!sub_mk_success) { ++ return false; ++ } ++ slash = strchr(slash + 1, '/'); ++ } ++ } ++ ++ cgpath = must_make_path(h->at_mnt, h->at_base, cgname, NULL); ++ sub_mk_success = build_sub_cpuset_cgroup_dir(cgpath); ++ free(cgpath); ++ if (!sub_mk_success) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static int isulad_mkdir_eexist_on_last(const char *dir, mode_t mode) ++{ ++ const char *tmp = dir; ++ const char *orig = dir; ++ ++ do { ++ int ret; ++ size_t cur_len; ++ char *makeme; ++ ++ dir = tmp + strspn(tmp, "/"); ++ tmp = dir + strcspn(dir, "/"); ++ ++ errno = ENOMEM; ++ cur_len = dir - orig; ++ makeme = strndup(orig, cur_len); ++ if (!makeme) ++ return -1; ++ ++ ret = mkdir(makeme, mode); ++ if (ret < 0) { 
++ if (errno != EEXIST) { ++ SYSERROR("Failed to create directory \"%s\"", makeme); ++ free(makeme); ++ return -1; ++ } ++ } ++ free(makeme); ++ ++ } while (tmp != dir); ++ ++ return 0; ++} ++#endif ++ + /* Called from setup_limits - here we have the container's cgroup_data because + * we created the cgroups. + */ +@@ -3075,6 +3444,13 @@ static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename, + /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */ + char converted_value[50]; + struct hierarchy *h; ++#ifdef HAVE_ISULAD ++ int nret = 0; ++ int retry_count = 0; ++ int max_retry = 10; ++ char *fullpath; ++ char *container_cgroup = ops->container_cgroup; ++#endif + + controller = strdup(filename); + if (!controller) +@@ -3097,6 +3473,27 @@ static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename, + if (!h) + return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller); + ++#ifdef HAVE_ISULAD ++ fullpath = must_make_path(h->path_con, filename, NULL); ++retry: ++ nret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666); ++ if (nret != 0) { ++ if (retry_count < max_retry) { ++ SYSERROR("setting cgroup config for ready process caused \"failed to write %s to %s\".", value, fullpath); ++ (void)isulad_cg_legacy_handle_cpuset_hierarchy(h, container_cgroup); ++ (void)isulad_mkdir_eexist_on_last(h->path_con, 0755); ++ usleep(100 * 1000); /* 100 millisecond */ ++ retry_count++; ++ goto retry; ++ } ++ lxc_write_error_message(ops->errfd, ++ "%s:%d: setting cgroup config for ready process caused failed to write %s to %s: %s", ++ __FILE__, __LINE__, value, fullpath, strerror(errno)); ++ } ++ free(fullpath); ++ return nret; ++#endif ++ + if (is_cpuset) { + int ret = lxc_write_openat(h->path_con, filename, value, strlen(value)); + if (ret) +@@ -3131,12 +3528,76 @@ static void 
sort_cgroup_settings(struct lxc_conf *conf) + + } + ++#ifdef HAVE_ISULAD ++/* Read cgroup file @filename under the matching hierarchy's path_con into ++ * @value, reading at most @len bytes; @len is the caller's buffer size. ++ */ ++static int isulad_cg_legacy_get_data(struct cgroup_ops *ops, const char *filename, ++ char *value, size_t len) ++{ ++ char *fullpath = NULL; ++ char *p = NULL; ++ struct hierarchy *h = NULL; ++ int ret = 0; ++ char *controller = NULL; ++ ++ size_t name_len = strlen(filename); /* keep @len intact for the read below */ ++ if (SIZE_MAX - 1 < name_len) { ++ errno = EINVAL; ++ return -1; ++ } ++ controller = calloc(1, name_len + 1); ++ if (controller == NULL) { ++ errno = ENOMEM; ++ return -1; ++ } ++ (void)strlcpy(controller, filename, name_len + 1); ++ ++ p = strchr(controller, '.'); ++ if (p) ++ *p = '\0'; ++ ++ ++ h = get_hierarchy(ops, controller); ++ if (!h) { ++ ERROR("Failed to setup limits for the \"%s\" controller. " ++ "The controller seems to be unused by \"cgfsng\" cgroup " ++ "driver or not enabled on the cgroup hierarchy", ++ controller); ++ errno = ENOENT; ++ free(controller); ++ return -ENOENT; ++ } ++ ++ fullpath = must_make_path(h->path_con, filename, NULL); ++ ret = lxc_read_from_file(fullpath, value, len); ++ free(fullpath); ++ free(controller); ++ return ret; ++} ++ ++static char *trim(char *s) ++{ ++ size_t len; ++ ++ len = strlen(s); ++ while ((len > 1) && (s[len - 1] == '\n')) ++ s[--len] = '\0'; ++ ++ return s; ++} ++#endif ++ + __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops, + struct lxc_conf *conf, + bool do_devices) + { + struct list_head *cgroup_settings; + struct lxc_cgroup *cgroup; ++#ifdef HAVE_ISULAD ++ char value[21 + 1] = { 0 }; ++ long long int readvalue, setvalue; ++#endif + + if (!ops) + return ret_set_errno(false, ENOENT); +@@ -3157,6 +3618,27 @@ __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops, + sort_cgroup_settings(conf); + list_for_each_entry(cgroup, cgroup_settings, head) { + if (do_devices == strnequal("devices", cgroup->subsystem, 7)) { ++#ifdef
HAVE_ISULAD ++ const char *cgvalue = cgroup->value; ++ if (strcmp("files.limit", cgroup->subsystem) == 0) { ++ if (lxc_safe_long_long(cgvalue, &setvalue) != 0) { ++ SYSERROR("Invalid integer value %s", cgvalue); ++ return false; ++ } ++ if (setvalue <= 0) { ++ cgvalue = "max"; ++ } ++ } ++ if (cg_legacy_set_data(ops, cgroup->subsystem, cgvalue, strnequal("cpuset", cgroup->subsystem, 6))) { ++ if (do_devices && (errno == EACCES || errno == EPERM)) { ++ SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgvalue); ++ continue; ++ } ++ SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgvalue); ++ return false; ++ } ++ DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgvalue); ++#else + if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) { + if (do_devices && (errno == EACCES || errno == EPERM)) { + SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value); +@@ -3166,7 +3648,40 @@ __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops, + return false; + } + DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value); ++#endif + } ++ ++#ifdef HAVE_ISULAD ++ // isulad: check cpu shares ++ if (strcmp(cgroup->subsystem, "cpu.shares") == 0) { ++ if (isulad_cg_legacy_get_data(ops, cgroup->subsystem, value, sizeof(value) - 1) < 0) { ++ SYSERROR("Error get %s", cgroup->subsystem); ++ return false; ++ } ++ trim(value); ++ if (lxc_safe_long_long(cgroup->value, &setvalue) != 0) { ++ SYSERROR("Invalid value %s", cgroup->value); ++ return false; ++ } ++ if (lxc_safe_long_long(value, &readvalue) != 0) { ++ SYSERROR("Invalid value %s", value); ++ return false; ++ } ++ if (setvalue > readvalue) { ++ ERROR("The maximum allowed cpu-shares is %s", value); ++ lxc_write_error_message(ops->errfd, ++ "%s:%d: setting cgroup config for ready process caused \"The maximum allowed cpu-shares is %s\".", ++ __FILE__, __LINE__, value); ++ return false; 
++ } else if (setvalue < readvalue) { ++ ERROR("The minimum allowed cpu-shares is %s", value); ++ lxc_write_error_message(ops->errfd, ++ "%s:%d: setting cgroup config for ready process caused \"The minimum allowed cpu-shares is %s\".", ++ __FILE__, __LINE__, value); ++ return false; ++ } ++ } ++#endif + } + + INFO("Limits for the legacy cgroup hierarchies have been setup"); +@@ -3359,6 +3874,10 @@ static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cg + + __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops) + { ++#ifdef HAVE_ISULAD ++ // ignore monitor cgroup delegate controllers ++ return true; ++#endif + if (!ops) + return ret_set_errno(false, ENOENT); + +@@ -3370,6 +3889,13 @@ __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops * + if (!ops) + return ret_set_errno(false, ENOENT); + ++#ifdef HAVE_ISULAD ++ if (ops->no_controller) { ++ DEBUG("no controller found, isgnore isulad_cgfsng_payload_delegate_controllers"); ++ return true; ++ } ++#endif ++ + return __cgfsng_delegate_controllers(ops, ops->container_cgroup); + } + +@@ -3635,7 +4161,9 @@ static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, + if (!controller_list) { + TRACE("No controllers are enabled for delegation in the unified hierarchy"); + #ifdef HAVE_ISULAD ++ if (fhas_fs_type(ops->dfd_mnt, CGROUP2_SUPER_MAGIC)) { + ops->no_controller = true; ++ } + #endif + controller_list = list_new(); + if (!controller_list) +@@ -3839,9 +4367,18 @@ static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf) + return 0; + } + ++#ifdef HAVE_ISULAD ++__cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops, struct lxc_conf *conf) ++#else + __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops) ++#endif + { + const char *cgroup_pattern; ++#ifdef HAVE_ISULAD ++ const char *cgroup_tree; ++ __do_free char *container_cgroup = NULL, *__cgroup_tree = NULL; ++ size_t len; ++#endif + + if 
(!ops) + return ret_set_errno(-1, ENOENT); +@@ -3854,9 +4391,48 @@ __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops) + return ret_errno(ENOMEM); + } + ++#ifdef HAVE_ISULAD ++ if (conf->cgroup_meta.dir) { ++ cgroup_tree = conf->cgroup_meta.dir; ++ container_cgroup = must_concat(&len, cgroup_tree, "/", conf->name, NULL); ++ } else if (ops->cgroup_pattern) { ++ __cgroup_tree = lxc_string_replace("%n", conf->name, ops->cgroup_pattern); ++ if (!__cgroup_tree) ++ return ret_set_errno(-1, ENOMEM); ++ ++ cgroup_tree = __cgroup_tree; ++ container_cgroup = must_concat(&len, cgroup_tree, NULL); ++ } else { ++ cgroup_tree = NULL; ++ container_cgroup = must_concat(&len, conf->name, NULL); ++ } ++ if (!container_cgroup) ++ return ret_set_errno(-1, ENOMEM); ++ ++ ops->container_cgroup = move_ptr(container_cgroup); ++#endif ++ + return 0; + } + ++#ifdef HAVE_ISULAD ++__cgfsng_ops static const char *isulad_cgfsng_get_cgroup_full_path(struct cgroup_ops *ops, ++ const char *controller) ++{ ++ struct hierarchy *h; ++ ++ h = get_hierarchy(ops, controller); ++ if (!h) ++ return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"", ++ controller ? 
controller : "(null)"); ++ ++ if (!h->path_con) ++ h->path_con = must_make_path(h->at_mnt, h->at_base, ops->container_cgroup, NULL); ++ ++ return h->path_con; ++} ++#endif ++ + struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) + { + __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL; +@@ -3888,7 +4464,12 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) + cgfsng_ops->unfreeze = cgfsng_unfreeze; + cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy; + cgfsng_ops->setup_limits = cgfsng_setup_limits; ++#ifdef HAVE_ISULAD ++ cgfsng_ops->driver = "isulad_cgfsng"; ++ cgfsng_ops->get_cgroup_full_path = isulad_cgfsng_get_cgroup_full_path; ++#else + cgfsng_ops->driver = "cgfsng"; ++#endif + cgfsng_ops->version = "1.0.0"; + cgfsng_ops->attach = cgfsng_attach; + cgfsng_ops->chown = cgfsng_chown; +diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h +index d9159f4..0a4ce64 100644 +--- a/src/lxc/cgroups/cgroup.h ++++ b/src/lxc/cgroups/cgroup.h +@@ -18,7 +18,11 @@ + + #define DEFAULT_CGROUP_MOUNTPOINT_RELATIVE "sys/fs/cgroup" + #define DEFAULT_CGROUP_MOUNTPOINT "/sys/fs/cgroup" ++#ifdef HAVE_ISULAD ++#define DEFAULT_PAYLOAD_CGROUP_PREFIX "" ++#else + #define DEFAULT_PAYLOAD_CGROUP_PREFIX "lxc.payload." ++#endif + #define DEFAULT_MONITOR_CGROUP_PREFIX "lxc.monitor." 
+ #define DEFAULT_PAYLOAD_CGROUP "payload" + #define DEFAULT_MONITOR_CGROUP "monitor" +diff --git a/src/lxc/conf.c b/src/lxc/conf.c +index 34cf90a..ff5cefc 100644 +--- a/src/lxc/conf.c ++++ b/src/lxc/conf.c +@@ -4814,13 +4814,12 @@ int lxc_setup(struct lxc_handler *handler) + return log_error(-1, "Failed to verify start hooks"); + + #ifdef HAVE_ISULAD +- if (setup_proc) +-#endif +- ret = lxc_create_tmp_proc_mount(lxc_conf); +- if (ret < 0) +- return log_error(-1, "Failed to mount transient procfs instance for LSMs"); ++ if (setup_proc) { ++ ret = lxc_create_tmp_proc_mount(lxc_conf); ++ if (ret < 0) ++ return log_error(-1, "Failed to mount transient procfs instance for LSMs"); ++ } + +-#ifdef HAVE_ISULAD + if (setup_rootfs_mountopts(&lxc_conf->rootfs)) { + return log_error(-1, "failed to set rootfs for '%s'", name); + } +@@ -4831,6 +4830,10 @@ int lxc_setup(struct lxc_handler *handler) + } + } + #else ++ ret = lxc_create_tmp_proc_mount(lxc_conf); ++ if (ret < 0) ++ return log_error(-1, "Failed to mount transient procfs instance for LSMs"); ++ + ret = lxc_setup_devpts_child(handler); + if (ret < 0) + return log_error(-1, "Failed to prepare new devpts instance"); +@@ -4855,7 +4858,7 @@ int lxc_setup(struct lxc_handler *handler) + + #ifdef HAVE_ISULAD + /* Ask father to run oci prestart hooks and wait for him to finish. 
*/ +- if (lxc_sync_barrier_parent(handler, START_SYNC_OCI_PRESTART_HOOK)) { ++ if (!lxc_sync_barrier_parent(handler, START_SYNC_OCI_PRESTART_HOOK)) { + return log_error(-1, "Failed to sync parent to start host hook"); + } + #endif +diff --git a/src/lxc/confile.c b/src/lxc/confile.c +index 0d0d66c..ae1a264 100644 +--- a/src/lxc/confile.c ++++ b/src/lxc/confile.c +@@ -287,16 +287,18 @@ static struct lxc_config_t config_jump_table[] = { + { "lxc.sysctl", false, set_config_sysctl, get_config_sysctl, clr_config_sysctl, }, + { "lxc.proc", false, set_config_proc, get_config_proc, clr_config_proc, }, + #ifdef HAVE_ISULAD +- { "lxc.isulad.init.args", true, set_config_init_args, get_config_init_args, clr_config_init_args, }, +- { "lxc.isulad.populate.device", true, set_config_populate_device, get_config_populate_device, clr_config_populate_device, }, +- { "lxc.isulad.umask", true, set_config_umask, get_config_umask, clr_config_umask, }, +- { "lxc.isulad.rootfs.maskedpaths", true, set_config_rootfs_masked_paths, get_config_rootfs_masked_paths, clr_config_rootfs_masked_paths, }, +- { "lxc.isulad.rootfs.ropaths", true, set_config_rootfs_ro_paths, get_config_rootfs_ro_paths, clr_config_rootfs_ro_paths, }, +- { "lxc.isulad.systemd", true, set_config_systemd, get_config_systemd, clr_config_systemd, }, +- { "lxc.console.logdriver", true, set_config_console_log_driver, get_config_console_log_driver, clr_config_console_log_driver, }, +- { "lxc.console.syslog_tag", true, set_config_console_syslog_tag, get_config_console_syslog_tag, clr_config_console_syslog_tag, }, +- { "lxc.console.syslog_facility", true, set_config_console_syslog_facility, get_config_console_syslog_facility, clr_config_console_syslog_facility, }, +- { "lxc.selinux.mount_context", true, set_config_selinux_mount_context, get_config_selinux_mount_context, clr_config_selinux_mount_context, }, ++ { "lxc.isulad.init.args", true, set_config_init_args, get_config_init_args, clr_config_init_args, }, ++ { 
"lxc.isulad.populate.device", true, set_config_populate_device, get_config_populate_device, clr_config_populate_device, }, ++ { "lxc.isulad.umask", true, set_config_umask, get_config_umask, clr_config_umask, }, ++ { "lxc.isulad.rootfs.maskedpaths", true, set_config_rootfs_masked_paths, get_config_rootfs_masked_paths, clr_config_rootfs_masked_paths, }, ++ { "lxc.isulad.rootfs.ropaths", true, set_config_rootfs_ro_paths, get_config_rootfs_ro_paths, clr_config_rootfs_ro_paths, }, ++ { "lxc.isulad.systemd", true, set_config_systemd, get_config_systemd, clr_config_systemd, }, ++ { "lxc.console.logdriver", true, set_config_console_log_driver, get_config_console_log_driver, clr_config_console_log_driver, }, ++ { "lxc.console.syslog_tag", true, set_config_console_syslog_tag, get_config_console_syslog_tag, clr_config_console_syslog_tag, }, ++ { "lxc.console.syslog_facility", true, set_config_console_syslog_facility, get_config_console_syslog_facility, clr_config_console_syslog_facility, }, ++ { "lxc.selinux.mount_context", true, set_config_selinux_mount_context, get_config_selinux_mount_context, clr_config_selinux_mount_context, }, ++ // same to lxc.init.groups ++ { "lxc.isulad.init.groups", true, set_config_init_groups, get_config_init_groups, clr_config_init_groups, }, + #endif + }; + +@@ -1343,7 +1345,11 @@ static int set_config_init_groups(const char *key, const char *value, + if (!value_dup) + return -ENOMEM; + ++#ifdef HAVE_ISULAD ++ lxc_iterate_parts(token, value_dup, " \t") ++#else + lxc_iterate_parts(token, value_dup, ",") ++#endif + num_groups++; + + if (num_groups == INT_MAX) +@@ -1368,7 +1374,11 @@ static int set_config_init_groups(const char *key, const char *value, + /* Restore duplicated value so we can call lxc_iterate_parts() again. 
*/ + strcpy(value_dup, value); + ++#ifdef HAVE_ISULAD ++ lxc_iterate_parts(token, value_dup, " \t") { ++#else + lxc_iterate_parts(token, value_dup, ",") { ++#endif + int ret; + + gid_t group; +@@ -2000,13 +2010,13 @@ static int set_config_cgroup_dir(const char *key, const char *value, + + if (lxc_config_value_empty(value)) + return clr_config_cgroup_dir(key, lxc_conf, NULL); +- ++#ifndef HAVE_ISULAD + if (abspath(value)) + return syserror_set(-EINVAL, "%s paths may not be absolute", key); + + if (dotdot(value)) + return syserror_set(-EINVAL, "%s paths may not walk upwards via \"../\"", key); +- ++#endif + return set_config_path_item(&lxc_conf->cgroup_meta.dir, value); + } + +diff --git a/src/lxc/meson.build b/src/lxc/meson.build +index 3166401..6c4ba6a 100644 +--- a/src/lxc/meson.build ++++ b/src/lxc/meson.build +@@ -25,6 +25,7 @@ liblxcfs_version_file = configure_file( + ) + + liblxc_sources = files( ++ 'cgroups/cgfsng.c', + 'cgroups/cgroup.c', + 'cgroups/cgroup.h', + 'cgroups/cgroup2_devices.c', +@@ -140,7 +141,6 @@ liblxc_sources = files( + + if want_isulad + liblxc_sources += files( +- 'cgroups/isulad_cgfsng.c', + 'exec_commands.c', + 'exec_commands.h', + 'isulad_utils.c', +@@ -159,8 +159,6 @@ if want_isulad + 'json/oci_runtime_spec.h', + 'json/read-file.c', + 'json/read-file.h') +-else +- liblxc_sources += files('cgroups/cgfsng.c') + endif + + if want_apparmor and libapparmor.found() +diff --git a/src/lxc/start.c b/src/lxc/start.c +index ff9a3fa..a7bc2e6 100644 +--- a/src/lxc/start.c ++++ b/src/lxc/start.c +@@ -2384,13 +2384,12 @@ static int lxc_spawn(struct lxc_handler *handler) + goto out_delete_net; + } + +- /* Tell the child to continue its initialization. We'll get +- * START_SYNC_POST_OCI_PRESTART_HOOK when it is ready for us to run oci prestart hooks. +- */ +- if (lxc_sync_wake_child(handler, START_SYNC_POST_OCI_PRESTART_HOOK)) +- goto out_delete_net; ++ /* Tell the child to continue its initialization. 
We'll get ++ * START_SYNC_POST_OCI_PRESTART_HOOK when it is ready for us to run oci prestart hooks. ++ */ ++ if (!lxc_sync_wake_child(handler, START_SYNC_POST_OCI_PRESTART_HOOK)) ++ goto out_delete_net; + #endif +- + if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS)) + goto out_delete_net; + +-- +2.25.1 + diff --git a/lxc.spec b/lxc.spec index a1284b2..f4775b4 100644 --- a/lxc.spec +++ b/lxc.spec @@ -1,5 +1,5 @@ -%global _release 3 -%global enable_isulad 0 +%global _release 4 +%global enable_isulad 1 Name: lxc Version: 5.0.2 @@ -14,6 +14,8 @@ Patch0002: 0002-iSulad-adapt-security-conf-attach-cgroup-and-start.patch Patch0003: 0003-iSulad-adapt-conf-network-storage-and-termianl.patch Patch0004: 0004-iSulad-adapt-confile-lxccontainer-and-start.patch Patch0005: 0005-fix-compile-error.patch +Patch0006: 0006-remove-isulad_cgfsng.patch +Patch0007: 0007-fix-run-container-failed-when-enable-isulad.patch BuildRequires: systemd-units git libtool graphviz docbook2X doxygen chrpath BuildRequires: pkgconfig(libseccomp) @@ -201,6 +203,12 @@ meson test -C build %endif %changelog +* Wed Oct 18 2023 zhangxiaoyu - 5.0.2-4 +- Type: bugfix +- ID:NA +- SUG:NA +- DESC: fix run container failed when enable isulad + * Fri Aug 04 2023 zhangxiaoyu - 5.0.2-3 - Type:enhancement - ID:NA -- Gitee