From 8d42f8e682a572b56f4237294726b4cbc5914dac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BF=A0=E5=87=8C?= Date: Mon, 24 Jun 2024 11:49:31 +0800 Subject: [PATCH] core: Fix 2 subcgroup deletion problems - core: Fix subcgroup deletion introduced by full delegation - core: Fix cgroups members mask cache propagation problem --- ...ly-simplify-caching-of-cgroups-membe.patch | 228 ++++++++++++++++++ ...group-FullDelegation-FullDelegationD.patch | 163 +++++++++++++ systemd.spec | 15 +- 3 files changed, 402 insertions(+), 4 deletions(-) create mode 100644 10028-cgroup-drastically-simplify-caching-of-cgroups-membe.patch create mode 100644 20009-core-introduce-cgroup-FullDelegation-FullDelegationD.patch diff --git a/10028-cgroup-drastically-simplify-caching-of-cgroups-membe.patch b/10028-cgroup-drastically-simplify-caching-of-cgroups-membe.patch new file mode 100644 index 0000000..d69f0ba --- /dev/null +++ b/10028-cgroup-drastically-simplify-caching-of-cgroups-membe.patch @@ -0,0 +1,228 @@ +From 5af8805872809e6de4cc4d9495cb1a904772ab4e Mon Sep 17 00:00:00 2001 +From: Lennart Poettering +Date: Fri, 23 Nov 2018 01:07:34 +0100 +Subject: [PATCH] cgroup: drastically simplify caching of cgroups members mask + +Previously we tried to be smart: when a new unit appeared and it only +added controllers to the cgroup mask we'd update the cached members mask +in all parents by ORing in the controller flags in their cached values. +Unfortunately this was quite broken, as we missed some conditions when +this cache had to be reset (for example, when a unit got unloaded), +moreover the optimization doesn't work when a controller is removed +anyway (as in that case there's no other way for the parent to iterate +though all children if any other, remaining child unit still needs it). +Hence, let's simplify the logic substantially: instead of updating the +cache on the right events (which we didn't get right), let's simply +invalidate the cache, and generate it lazily when we encounter it later. +This should actually result in better behaviour as we don't have to +calculate the new members mask for a whole subtree whever we have the +suspicion something changed, but can delay it to the point where we +actually need the members mask. + +This allows us to simplify things quite a bit, which is good, since +validating this cache for correctness is hard enough. + +Fixes: #9512 +--- + src/core/cgroup.c | 49 +++++------------------------------------ + src/core/cgroup.h | 2 +- + src/core/dbus-mount.c | 2 +- + src/core/dbus-scope.c | 2 +- + src/core/dbus-service.c | 2 +- + src/core/dbus-slice.c | 2 +- + src/core/dbus-socket.c | 2 +- + src/core/dbus-swap.c | 2 +- + src/core/unit.c | 3 ++- + src/core/unit.h | 2 -- + 10 files changed, 14 insertions(+), 54 deletions(-) + +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index 6a5606f..d569077 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -1450,53 +1450,12 @@ bool unit_get_needs_bpf(Unit *u) { + return false; + } + +-/* Recurse from a unit up through its containing slices, propagating +- * mask bits upward. A unit is also member of itself. */ +-void unit_update_cgroup_members_masks(Unit *u) { +- CGroupMask m; +- bool more; +- ++void unit_invalidate_cgroup_members_masks(Unit *u) { + assert(u); +- +- /* Calculate subtree mask */ +- m = unit_get_subtree_mask(u); +- +- /* See if anything changed from the previous invocation. If +- * not, we're done. */ +- if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask) +- return; +- +- more = +- u->cgroup_subtree_mask_valid && +- ((m & ~u->cgroup_subtree_mask) != 0) && +- ((~m & u->cgroup_subtree_mask) == 0); +- +- u->cgroup_subtree_mask = m; +- u->cgroup_subtree_mask_valid = true; +- +- if (UNIT_ISSET(u->slice)) { +- Unit *s = UNIT_DEREF(u->slice); +- +- if (more) +- /* There's more set now than before. We +- * propagate the new mask to the parent's mask +- * (not caring if it actually was valid or +- * not). */ +- +- s->cgroup_members_mask |= m; +- +- else +- /* There's less set now than before (or we +- * don't know), we need to recalculate +- * everything, so let's invalidate the +- * parent's members mask */ +- +- s->cgroup_members_mask_valid = false; +- +- /* And now make sure that this change also hits our +- * grandparents */ +- unit_update_cgroup_members_masks(s); +- } ++ /* Recurse invalidate the member masks cache all the way up the tree */ ++ u->cgroup_members_mask_valid = false; ++ if (UNIT_ISSET(u->slice)) ++ unit_invalidate_cgroup_members_masks(UNIT_DEREF(u->slice)); + } + + const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) { +diff --git a/src/core/cgroup.h b/src/core/cgroup.h +index 36ea77f..a2e1644 100644 +--- a/src/core/cgroup.h ++++ b/src/core/cgroup.h +@@ -181,7 +181,7 @@ CGroupMask unit_get_enable_mask(Unit *u); + + bool unit_get_needs_bpf(Unit *u); + +-void unit_update_cgroup_members_masks(Unit *u); ++void unit_invalidate_cgroup_members_masks(Unit *u); + + const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask); + char *unit_default_cgroup_path(Unit *u); + +diff --git a/src/core/dbus-mount.c b/src/core/dbus-mount.c +index 3f98d3ecf0..b6d61627eb 100644 +--- a/src/core/dbus-mount.c ++++ b/src/core/dbus-mount.c +@@ -145,7 +145,7 @@ int bus_mount_set_property( + int bus_mount_commit_properties(Unit *u) { + assert(u); + +- unit_update_cgroup_members_masks(u); ++ unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c +index 5d9fe98857..bb807df2e9 100644 +--- a/src/core/dbus-scope.c ++++ b/src/core/dbus-scope.c +@@ -186,7 +186,7 @@ int bus_scope_set_property( + int bus_scope_commit_properties(Unit *u) { + assert(u); + +- unit_update_cgroup_members_masks(u); ++ unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c +index fdf6120610..10f53ef401 100644 +--- a/src/core/dbus-service.c ++++ b/src/core/dbus-service.c +@@ -424,7 +424,7 @@ int bus_service_set_property( + int bus_service_commit_properties(Unit *u) { + assert(u); + +- unit_update_cgroup_members_masks(u); ++ unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +diff --git a/src/core/dbus-slice.c b/src/core/dbus-slice.c +index 722a5688a5..effd5fa5d7 100644 +--- a/src/core/dbus-slice.c ++++ b/src/core/dbus-slice.c +@@ -28,7 +28,7 @@ int bus_slice_set_property( + int bus_slice_commit_properties(Unit *u) { + assert(u); + +- unit_update_cgroup_members_masks(u); ++ unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +diff --git a/src/core/dbus-socket.c b/src/core/dbus-socket.c +index 4ea5b6c6e5..3819653908 100644 +--- a/src/core/dbus-socket.c ++++ b/src/core/dbus-socket.c +@@ -461,7 +461,7 @@ int bus_socket_set_property( + int bus_socket_commit_properties(Unit *u) { + assert(u); + +- unit_update_cgroup_members_masks(u); ++ unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +diff --git a/src/core/dbus-swap.c b/src/core/dbus-swap.c +index b272d10113..353fa20132 100644 +--- a/src/core/dbus-swap.c ++++ b/src/core/dbus-swap.c +@@ -63,7 +63,7 @@ int bus_swap_set_property( + int bus_swap_commit_properties(Unit *u) { + assert(u); + +- unit_update_cgroup_members_masks(u); ++ unit_invalidate_cgroup_members_masks(u); + unit_realize_cgroup(u); + + return 0; +diff --git a/src/core/unit.c b/src/core/unit.c +index 392cc2d7c5..a8c0f08e95 100644 +--- a/src/core/unit.c ++++ b/src/core/unit.c +@@ -1547,7 +1547,8 @@ int unit_load(Unit *u) { + if (u->job_running_timeout != USEC_INFINITY && u->job_running_timeout > u->job_timeout) + log_unit_warning(u, "JobRunningTimeoutSec= is greater than JobTimeoutSec=, it has no effect."); + +- unit_update_cgroup_members_masks(u); ++ /* We finished loading, let's ensure our parents recalculate the members mask */ ++ unit_invalidate_cgroup_members_masks(u); + } + + assert((u->load_state != UNIT_MERGED) == !u->merged_into); +diff --git a/src/core/unit.h b/src/core/unit.h +index b8b9147..e2dd794 100644 +--- a/src/core/unit.h ++++ b/src/core/unit.h +@@ -265,7 +265,6 @@ typedef struct Unit { + char *cgroup_path; + CGroupMask cgroup_realized_mask; + CGroupMask cgroup_enabled_mask; +- CGroupMask cgroup_subtree_mask; + CGroupMask cgroup_members_mask; + int cgroup_inotify_wd; + +@@ -341,7 +340,6 @@ typedef struct Unit { + + bool cgroup_realized:1; + bool cgroup_members_mask_valid:1; +- bool cgroup_subtree_mask_valid:1; + + UnitCGroupBPFState cgroup_bpf_state:2; + + diff --git a/20009-core-introduce-cgroup-FullDelegation-FullDelegationD.patch b/20009-core-introduce-cgroup-FullDelegation-FullDelegationD.patch new file mode 100644 index 0000000..bcd28b5 --- /dev/null +++ b/20009-core-introduce-cgroup-FullDelegation-FullDelegationD.patch @@ -0,0 +1,163 @@ +From f4fc78bb9b250e7e8f5197aa15055239276ec3cd Mon Sep 17 00:00:00 2001 +From: rpm-build +Date: Wed, 10 Jul 2024 17:46:57 +0800 +Subject: [PATCH] core: introduce cgroup FullDelegation, + FullDelegationDeviceCGroup for compability + +Whille using systemd-219, users can set 'delegate=y' to claim the +possession of cgroup settings. By then, users are able to write raw +values under /sys/fs/cgroup to adjust cgroup settings and systemd +won't touch these values any longer. + +However, this is likely to be an undefined behaviour for systemd-219. +Upon releasing systemd-239, a documentation of cgroup delegation was +added, +https://github.com/systemd/systemd/commit/e30eaff3a32523b09d61af67fc999f1f62f4e0cb. +It states that: + +Only sub-trees can be delegated (though whoever decides to request a +sub-tree can delegate sub-sub-trees further to somebody else if they +like it).' + +Which is quite different from what people understand the delegation of +systemd-219. Currently, whether a unit is delegated or not, systemd always +possesses any cgroup it created, only ignoring the sub-tree ones +according to delegation settings. + +This behaviour change causes confusion if users switch from systemd-219 to +systemd-239. As a result, we introduce 'FullDelegation', a feature that +brings what users are already familiar with to systemd-239. If users set +'FullDelegation=yes' in /etc/systemd/system.conf, they can control raw +values under /sys/fs/cgroup without worrying systemd touching these +values, which is the same as what they expected with systemd-219. + +The 'FullDelegation' option should not be enabled by default, as it alters the +default behavior that users are accustomed to or will become familiar with. +However, without enabling this option, GPU containers will not function +correctly. To address this issue, we have introduced +'FullDelegationDeviceCGroup', which replicates the behavior of systemd-219 +specifically for device cgroups. This option is enabled by default. + +During the use of earlier versions of systemd, we encountered bug reports +indicating that when 'FullDelegation' is enabled, subcgroups are removed by +systemd. This issue arises due to a flaw in our modification of the +`unit_realize_cgroup_now` function. We overlooked the fact that, in addition to +creating cgroups, the unit_create_cgroup function also deletes subcgroups based +on unset bits in the `target_mask`. To resolve this, we have adjusted the +procedure by moving the reduction of the `target_mask` to occur after the +execution of `unit_create_cgroup`, thereby preventing the unintended deletion of +subcgroups. +--- + src/core/cgroup.c | 24 ++++++++++++++++++++++++ + src/core/main.c | 7 +++++++ + src/core/manager.h | 2 ++ + src/core/system.conf.in | 2 ++ + 4 files changed, 35 insertions(+) + +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index 8e474f6..6a5606f 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -1692,6 +1692,18 @@ static int unit_create_cgroup( + /* Keep track that this is now realized */ + u->cgroup_realized = true; + u->cgroup_realized_mask = target_mask; ++ ++ // While realizing cgroup, we don't realize delegated cgroup, therefore, target_mask ++ // doesn't contain delegated cgroup controller bit, and u->cgroup_realized_mask will ++ // not contain delegated cgroup controller bit as well. This unit will be in a state ++ // as if delegated cgroup is not set, which is not expected. ++ // If this is not present, delegated cgroup will be set every 2 systemctl daemon-reload ++ if (u->manager->full_delegation && unit_cgroup_delegate(u)) ++ u->cgroup_realized_mask |= unit_get_delegate_mask(u); ++ ++ if (u->manager->full_delegation_devicecg && unit_cgroup_delegate(u)) ++ u->cgroup_realized_mask |= (unit_get_delegate_mask(u) & CGROUP_MASK_DEVICES); ++ + u->cgroup_enabled_mask = enable_mask; + u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF; + +@@ -1940,6 +1952,12 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { + if (r < 0) + return r; + ++ if (u->manager->full_delegation && unit_cgroup_delegate(u)) ++ target_mask ^= u->cgroup_realized_mask; ++ ++ if (u->manager->full_delegation_devicecg && unit_cgroup_delegate(u)) ++ target_mask ^= (u->cgroup_realized_mask & CGROUP_MASK_DEVICES); ++ + /* Finally, apply the necessary attributes. */ + cgroup_context_apply(u, target_mask, apply_bpf, state); + cgroup_xattr_apply(u); +@@ -2882,6 +2900,12 @@ int unit_reset_ip_accounting(Unit *u) { + void unit_invalidate_cgroup(Unit *u, CGroupMask m) { + assert(u); + ++ if (u->manager->full_delegation) ++ m ^= unit_get_delegate_mask(u); // don't invalidate delegated cgroup ++ ++ if (u->manager->full_delegation_devicecg) ++ m ^= (unit_get_delegate_mask(u) & CGROUP_MASK_DEVICES); // don't invalidate device cgroup if delegate=yes ++ + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + +diff --git a/src/core/main.c b/src/core/main.c +index 546bf0d..e27f0a5 100644 +--- a/src/core/main.c ++++ b/src/core/main.c +@@ -142,6 +142,8 @@ static bool reexec_jmp_can = false; + static bool reexec_jmp_inited = false; + static sigjmp_buf reexec_jmp_buf; + static bool arg_default_cpuset_clone_children = false; ++static bool arg_full_delegation = false; ++static bool arg_full_delegation_devicecg = true; + + static int parse_configuration(const struct rlimit *saved_rlimit_nofile, + const struct rlimit *saved_rlimit_memlock); +@@ -768,6 +770,9 @@ static int parse_config_file(void) { + { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max }, + { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action }, + { "Manager", "DefaultCPUSetCloneChildren",config_parse_bool, 0, &arg_default_cpuset_clone_children }, ++ { "Manager", "FullDelegation", config_parse_bool, 0, &arg_full_delegation }, ++ { "Manager", "FullDelegationDeviceCGroup",config_parse_bool, 0, &arg_full_delegation_devicecg }, ++ + {} + }; + +@@ -817,6 +822,8 @@ static void set_manager_defaults(Manager *m) { + m->default_memory_accounting = arg_default_memory_accounting; + m->default_tasks_accounting = arg_default_tasks_accounting; + m->default_tasks_max = arg_default_tasks_max; ++ m->full_delegation = arg_full_delegation; ++ m->full_delegation_devicecg = arg_full_delegation_devicecg; + + manager_set_default_rlimits(m, arg_default_rlimit); + manager_environment_add(m, NULL, arg_default_environment); +diff --git a/src/core/manager.h b/src/core/manager.h +index 98d381b..8017d9a 100644 +--- a/src/core/manager.h ++++ b/src/core/manager.h +@@ -297,6 +297,8 @@ struct Manager { + bool default_blockio_accounting; + bool default_tasks_accounting; + bool default_ip_accounting; ++ bool full_delegation; ++ bool full_delegation_devicecg; + + uint64_t default_tasks_max; + usec_t default_timer_accuracy_usec; +diff --git a/src/core/system.conf.in b/src/core/system.conf.in +index 2f6852a..3f9ef7f 100644 +--- a/src/core/system.conf.in ++++ b/src/core/system.conf.in +@@ -67,3 +67,5 @@ DefaultLimitCORE=0:infinity + #DefaultLimitRTTIME= + #IPAddressAllow= + #IPAddressDeny= ++#FullDelegation=no ++#FullDelegationDeviceCGroup=yes +-- +2.39.3 + diff --git a/systemd.spec b/systemd.spec index cb2f21b..d5b5929 100644 --- a/systemd.spec +++ b/systemd.spec @@ -1,4 +1,4 @@ -%define anolis_release .0.6 +%define anolis_release .0.7 #global gitcommit 10e465b5321bd53c1fc59ffab27e724535c6bc0f %{?gitcommit:%global gitcommitshort %(c=%{gitcommit}; echo ${c:0:7})} @@ -1060,15 +1060,17 @@ Patch10024: 10024-fileio-teach-read_full_file_full-to-read-from-offse.patch Patch10025: 10025-cryptsetup-port-cryptsetup-s-main-key-file-logic-ov.patch Patch10026: 10026-umount-check-LO_FLAGS_AUTOCLEAR-after-LOOP_CLR_FD-cl.patch Patch10027: 10027-fix-compilation-without-utmp.patch +Patch10028: 10028-cgroup-drastically-simplify-caching-of-cgroups-membe.patch Patch20001: 20001-hwdb-parse_hwdb_dot_py.patch # Patch20002: 20002-cgroup-do-not-refresh-cgroup-devices-config-when-dae.patch -Patch20003: 20003-core-introduce-cgroup-full-delegation-for-compabilit.patch +# Patch20003: 20003-core-introduce-cgroup-full-delegation-for-compabilit.patch Patch20004: 20004-Update-vendor-ids-for-ieisystem-0750.patch -Patch20005: 20005-default-enable-full-delegation-on-device-cgroup.patch +# Patch20005: 20005-default-enable-full-delegation-on-device-cgroup.patch Patch20006: 20006-systemd-Add-sw64.patch Patch20007: 20007-add-seccomp-support-for-sw_64.patch Patch20008: 20008-Fix-unit-test-test-seccomp-support-on-sw_64.patch +Patch20009: 20009-core-introduce-cgroup-FullDelegation-FullDelegationD.patch # lifsea only patch %if %{defined lifsea_dist} @@ -1546,10 +1548,11 @@ python3 %{SOURCE2} %buildroot < 239-78.0.7 +- core: Fix subcgroup deletion caused by full delegation +- core: Fix cgroups members mask cache propagation problem + * Thu Apr 18 2024 Weisson - 239-78.0.6 - add seccomp support for sw_64. - add test-seccomp support for sw_64. -- Gitee