From d3eef3b09a86f8119d0fda747f1e8c6eb5433567 Mon Sep 17 00:00:00 2001
From: zhongjiawei
Date: Wed, 19 Jun 2024 16:14:59 +0800
Subject: [PATCH] runc: sync some patches

---
 git-commit                                    |    2 +-
 ...ngle-CPU-affinity-before-cgroup-cpus.patch | 1081 +++++++++++++++++
 patch/0041-runc-fix-a-data-race.patch         |   26 +
 runc.spec                                     |    8 +-
 series.conf                                   |    2 +
 5 files changed, 1117 insertions(+), 2 deletions(-)
 create mode 100644 patch/0040-runc-Set-temporary-single-CPU-affinity-before-cgroup-cpus.patch
 create mode 100644 patch/0041-runc-fix-a-data-race.patch

diff --git a/git-commit b/git-commit
index 0facadb..a0e9549 100644
--- a/git-commit
+++ b/git-commit
@@ -1 +1 @@
-288b7252b0db60842f5d7e1b9716c84c98f4ea30
+c628e7c7e16926a70d1385dfc447299647b8225a
diff --git a/patch/0040-runc-Set-temporary-single-CPU-affinity-before-cgroup-cpus.patch b/patch/0040-runc-Set-temporary-single-CPU-affinity-before-cgroup-cpus.patch
new file mode 100644
index 0000000..6e3097b
--- /dev/null
+++ b/patch/0040-runc-Set-temporary-single-CPU-affinity-before-cgroup-cpus.patch
@@ -0,0 +1,1081 @@
+From 6ec9a7792737878c643ed9a6c74757b616821a5c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?C=C3=A9dric=20Clerget?=
+Date: Fri, 30 Jun 2023 15:49:47 +0200
+Subject: [PATCH] Set temporary single CPU affinity before cgroup cpuset
+ transition.
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This handles a corner case when joining a container having all
+the processes running exclusively on isolated CPU cores, to force
+the kernel to schedule the runc process on the first CPU core within the
+cgroups cpuset.
+
+The introduction of the kernel commit
+46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 has affected this deterministic
+scheduling behavior by distributing tasks across CPU cores within the
+cgroups cpuset. Some intensive real-time applications are relying on this
+deterministic behavior and use the first CPU core to run a slow thread
+while other CPU cores are fully used by real-time threads with SCHED_FIFO
+policy. Such applications prevent the runc process from joining a container
+when the runc process is randomly scheduled on a CPU core owned by a
+real-time thread.
+
+Introduces the isolated CPU affinity transition OCI runtime annotation
+org.opencontainers.runc.exec.isolated-cpu-affinity-transition to restore
+the behavior during runc exec.
+
+Fixes an issue with kernel >= 6.2 not resetting CPU affinity for container processes.
+
+Signed-off-by: Cédric Clerget
+---
+ docs/isolated-cpu-affinity-transition.md      | 125 ++++++++
+ features.go                                   |   1 +
+ libcontainer/cgroups/cgroups.go               |   4 +
+ libcontainer/cgroups/fs/fs.go                 |  27 ++
+ libcontainer/cgroups/fs2/fs2.go               |  28 ++
+ libcontainer/cgroups/systemd/cpuset.go        |   9 +-
+ libcontainer/cgroups/systemd/v1.go            |   4 +
+ libcontainer/cgroups/systemd/v2.go            |   4 +
+ libcontainer/container_linux_test.go          |   4 +
+ libcontainer/process_linux.go                 | 271 +++++++++++++++++-
+ libcontainer/process_linux_test.go            | 232 +++++++++++++++
+ libcontainer/system/kernelparam/lookup.go     |  41 +++
+ .../system/kernelparam/lookup_test.go         |  60 ++++
+ tests/integration/helpers.bash                |  21 ++
+ 14 files changed, 828 insertions(+), 3 deletions(-)
+ create mode 100644 docs/isolated-cpu-affinity-transition.md
+ create mode 100644 libcontainer/process_linux_test.go
+ create mode 100644 libcontainer/system/kernelparam/lookup.go
+ create mode 100644 libcontainer/system/kernelparam/lookup_test.go
+
+diff --git a/docs/isolated-cpu-affinity-transition.md b/docs/isolated-cpu-affinity-transition.md
+new file mode 100644
+index 0000000..d2f3b12
+--- /dev/null
++++ b/docs/isolated-cpu-affinity-transition.md
+@@ -0,0 +1,125 @@
++## Isolated CPU affinity transition
++
++The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76
++in 5.7 has affected a deterministic scheduling behavior by distributing tasks
++across CPU cores within a cgroups cpuset. It means that `runc exec` might be
++impacted under some circumstances, for example when a container has been
++created within a cgroup cpuset entirely composed of isolated CPU cores,
++usually set with the `nohz_full` and/or `isolcpus` kernel boot parameters.
++
++Some containerized real-time applications are relying on this deterministic
++behavior and use the first CPU core to run a slow thread while other CPU
++cores are fully used by the real-time threads with SCHED_FIFO policy.
++Such applications can prevent the runc process from joining a container when
++the runc process is randomly scheduled on a CPU core owned by a real-time thread.
++
++Runc introduces a way to restore this behavior by adding the following
++annotation to the container runtime spec (`config.json`):
++
++`org.opencontainers.runc.exec.isolated-cpu-affinity-transition`
++
++This annotation can take one of these values:
++
++* `temporary`: to temporarily set the runc process CPU affinity to the first
++  isolated CPU core of the container cgroup cpuset.
++* `definitive`: to definitively set the runc process CPU affinity to the first
++  isolated CPU core of the container cgroup cpuset.
++
++For example:
++
++```json
++  "annotations": {
++      "org.opencontainers.runc.exec.isolated-cpu-affinity-transition": "temporary"
++  }
++```
++
++__WARNING:__ `definitive` requires a kernel >= 6.2; it also works with RHEL 9
++and above.
++
++### How does it work?
++
++When enabled, during `runc exec` runc looks up the `nohz_full` kernel boot
++parameter value and considers the CPUs in that list as isolated. It does not
++look at the `isolcpus` boot parameter; it simply assumes that the `isolcpus`
++value is identical to `nohz_full` when specified. If the `nohz_full` parameter
++is not found, runc also attempts to read the list from /sys/devices/system/cpu/nohz_full.
++
++Once it gets the isolated CPU list, it returns an eligible CPU core within the
++container cgroup cpuset based on these heuristics:
++
++* when there are no cpuset cores: no eligible CPU
++* when there are no isolated cores: no eligible CPU
++* when the cpuset cores are not in the isolated core list: no eligible CPU
++* when the cpuset cores are all isolated cores: return the first CPU of the cpuset
++* when the cpuset cores are mixed between housekeeping/isolated cores: return the
++  first housekeeping CPU not in the isolated CPUs.
++
++The returned CPU core is then used to set the `runc init` CPU affinity before
++the container cgroup cpuset transition.
++
++#### Transition example
++
++`nohz_full` has the isolated cores `4-7`. A container has been created with
++the cgroup cpuset `4-7` to only run on the isolated CPU cores 4 to 7.
++`runc exec` is called by a process with CPU affinity set to `0-3`.
++
++* with the `temporary` transition:
++
++  runc exec (affinity 0-3) -> runc init (affinity 4) -> container process (affinity 4-7)
++
++* with the `definitive` transition:
++
++  runc exec (affinity 0-3) -> runc init (affinity 4) -> container process (affinity 4)
++
++The difference between `temporary` and `definitive` is the resulting container process
++affinity: `definitive` constrains the container process to run on the
++first isolated CPU core of the cgroup cpuset, while `temporary` restores the
++CPU affinity to match the container cgroup cpuset.
++
++The `definitive` transition might be helpful when `nohz_full` is used without
++`isolcpus`, to prevent runc and the container process from being noisy
++neighbours for real-time applications.
++
++### How to use it with Kubernetes?
++
++Kubernetes doesn't manage containers directly; instead it uses the Container Runtime
++Interface (CRI) to communicate with software that implements this interface and is
++responsible for managing the lifecycle of containers. There are popular CRI
++implementations like Containerd and CRI-O. Those implementations allow pod annotations
++to be passed to the container runtime via the container runtime spec. Currently runc is the default runtime for both.
++
++#### Containerd configuration
++
++Containerd CRI uses runc by default but requires an extra step to pass the annotation to runc.
++You have to whitelist `org.opencontainers.runc.exec.isolated-cpu-affinity-transition` as a pod
++annotation allowed to be passed to the container runtime in `/etc/containerd/config.toml`:
++
++```toml
++[plugins."io.containerd.grpc.v1.cri".containerd]
++  default_runtime_name = "runc"
++  [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
++    [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
++      runtime_type = "io.containerd.runc.v2"
++      base_runtime_spec = "/etc/containerd/cri-base.json"
++      pod_annotations = ["org.opencontainers.runc.exec.isolated-cpu-affinity-transition"]
++```
++
++#### CRI-O configuration
++
++CRI-O doesn't require any extra step; however, some annotations could be excluded
++by configuration.
++ ++#### Pod deployment example ++ ++```yaml ++apiVersion: v1 ++kind: Pod ++metadata: ++ name: demo-pod ++ annotations: ++ org.opencontainers.runc.exec.isolated-cpu-affinity-transition: "temporary" ++spec: ++ containers: ++ - name: demo ++ image: registry.com/demo:latest ++``` +diff --git a/features.go b/features.go +index 7f76e7a..74c2f1a 100644 +--- a/features.go ++++ b/features.go +@@ -59,6 +59,7 @@ var featuresCommand = cli.Command{ + "bundle", + "org.systemd.property.", // prefix form + "org.criu.config", ++ "org.opencontainers.runc.exec.isolated-cpu-affinity-transition", + }, + } + +diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go +index ba2b226..c0e60f1 100644 +--- a/libcontainer/cgroups/cgroups.go ++++ b/libcontainer/cgroups/cgroups.go +@@ -56,4 +56,8 @@ type Manager interface { + + // OOMKillCount reports OOM kill count for the cgroup. + OOMKillCount() (uint64, error) ++ ++ // GetEffectiveCPUs returns the effective CPUs of the cgroup, an empty ++ // value means that the cgroups cpuset subsystem/controller is not enabled. ++ GetEffectiveCPUs() string + } +diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go +index c85a1e2..135a688 100644 +--- a/libcontainer/cgroups/fs/fs.go ++++ b/libcontainer/cgroups/fs/fs.go +@@ -4,6 +4,8 @@ import ( + "errors" + "fmt" + "os" ++ "path/filepath" ++ "strings" + "sync" + + "golang.org/x/sys/unix" +@@ -264,3 +266,28 @@ func (m *manager) OOMKillCount() (uint64, error) { + + return c, err + } ++ ++func (m *manager) GetEffectiveCPUs() string { ++ return GetEffectiveCPUs(m.Path("cpuset"), m.cgroups) ++} ++ ++func GetEffectiveCPUs(cpusetPath string, cgroups *configs.Cgroup) string { ++ // Fast path. ++ if cgroups.CpusetCpus != "" { ++ return cgroups.CpusetCpus ++ } else if !strings.HasPrefix(cpusetPath, defaultCgroupRoot) { ++ return "" ++ } ++ ++ // Iterates until it goes to the cgroup root path. ++ // It's required for containers in which cpuset controller ++ // is not enabled, in this case a parent cgroup is used. ++ for path := cpusetPath; path != defaultCgroupRoot; path = filepath.Dir(path) { ++ cpus, err := fscommon.GetCgroupParamString(path, "cpuset.effective_cpus") ++ if err == nil { ++ return cpus ++ } ++ } ++ ++ return "" ++} +diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go +index 492778e..96ebd75 100644 +--- a/libcontainer/cgroups/fs2/fs2.go ++++ b/libcontainer/cgroups/fs2/fs2.go +@@ -4,11 +4,13 @@ import ( + "errors" + "fmt" + "os" ++ "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" ++ "github.com/opencontainers/runc/libcontainer/utils" + ) + + type parseError = fscommon.ParseError +@@ -32,6 +34,9 @@ func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) + if err != nil { + return nil, err + } ++ } else { ++ // Clean path for safety. ++ dirPath = utils.CleanPath(dirPath) + } + + m := &manager{ +@@ -257,3 +262,26 @@ func (m *manager) OOMKillCount() (uint64, error) { + + return c, err + } ++ ++func (m *manager) GetEffectiveCPUs() string { ++ // Fast path. ++ if m.config.CpusetCpus != "" { ++ return m.config.CpusetCpus ++ } else if !strings.HasPrefix(m.dirPath, UnifiedMountpoint) { ++ return "" ++ } ++ ++ // Iterates until it goes outside of the cgroup root path. ++ // It's required for containers in which cpuset controller ++ // is not enabled, in this case a parent cgroup is used. 
++ outsidePath := filepath.Dir(UnifiedMountpoint) ++ ++ for path := m.dirPath; path != outsidePath; path = filepath.Dir(path) { ++ cpus, err := fscommon.GetCgroupParamString(path, "cpuset.cpus.effective") ++ if err == nil { ++ return cpus ++ } ++ } ++ ++ return "" ++} +diff --git a/libcontainer/cgroups/systemd/cpuset.go b/libcontainer/cgroups/systemd/cpuset.go +index dd474cf..ede7a63 100644 +--- a/libcontainer/cgroups/systemd/cpuset.go ++++ b/libcontainer/cgroups/systemd/cpuset.go +@@ -14,7 +14,9 @@ import ( + func RangeToBits(str string) ([]byte, error) { + bits := new(big.Int) + +- for _, r := range strings.Split(str, ",") { ++ splits := strings.Split(str, "+") ++ cpusetstr := splits[0] ++ for _, r := range strings.Split(cpusetstr, ",") { + // allow extra spaces around + r = strings.TrimSpace(r) + // allow empty elements (extra commas) +@@ -52,6 +54,11 @@ func RangeToBits(str string) ([]byte, error) { + return nil, errors.New("empty value") + } + ++ // fit cpuset parsing order in systemd ++ for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 { ++ ret[l], ret[r] = ret[r], ret[l] ++ } ++ + // fit cpuset parsing order in systemd + for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 { + ret[l], ret[r] = ret[r], ret[l] +diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go +index c1e58c9..21d15f5 100644 +--- a/libcontainer/cgroups/systemd/v1.go ++++ b/libcontainer/cgroups/systemd/v1.go +@@ -479,3 +479,7 @@ func (m *legacyManager) Exists() bool { + func (m *legacyManager) OOMKillCount() (uint64, error) { + return fs.OOMKillCount(m.Path("memory")) + } ++ ++func (m *legacyManager) GetEffectiveCPUs() string { ++ return fs.GetEffectiveCPUs(m.Path("cpuset"), m.cgroups) ++} +diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go +index 919e563..c88df28 100644 +--- a/libcontainer/cgroups/systemd/v2.go ++++ b/libcontainer/cgroups/systemd/v2.go +@@ -470,3 +470,7 @@ func (m *unifiedManager) Exists() bool { + func (m *unifiedManager) OOMKillCount() (uint64, error) { + return m.fsMgr.OOMKillCount() + } ++ ++func (m *unifiedManager) GetEffectiveCPUs() string { ++ return m.fsMgr.GetEffectiveCPUs() ++} +diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go +index 1a4b027..eb204bf 100644 +--- a/libcontainer/container_linux_test.go ++++ b/libcontainer/container_linux_test.go +@@ -76,6 +76,10 @@ func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) { + return configs.Thawed, nil + } + ++func (m *mockCgroupManager) GetEffectiveCPUs() string { ++ return "" ++} ++ + func (m *mockIntelRdtManager) Apply(pid int) error { + return nil + } +diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go +index 3b453cc..58391b6 100644 +--- a/libcontainer/process_linux.go ++++ b/libcontainer/process_linux.go +@@ -1,23 +1,28 @@ + package libcontainer + + import ( ++ "bytes" + "encoding/json" + "errors" + "fmt" + "io" ++ "io/fs" + "net" + "os" + "os/exec" + "path/filepath" ++ "runtime" + "strconv" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" ++ "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/logs" + "github.com/opencontainers/runc/libcontainer/system" ++ "github.com/opencontainers/runc/libcontainer/system/kernelparam" + 
"github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +@@ -84,8 +89,60 @@ func (p *setnsProcess) start() (retErr error) { + defer p.messageSockPair.parent.Close() + // get the "before" value of oom kill count + oom, _ := p.manager.OOMKillCount() +- err := p.cmd.Start() +- // close the write-side of the pipes (controlled by child) ++ ++ // When greater or equal to zero, it will set a temporary single CPU ++ // affinity before cgroup cpuset transition, this handles a corner ++ // case when joining a container having all the processes running ++ // exclusively on isolated CPU cores to force the kernel to schedule ++ // runc process on the first CPU core within the cgroups cpuset. ++ // The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 ++ // in 5.7 has affected this deterministic scheduling behavior by ++ // distributing tasks across CPU cores within the cgroups cpuset. ++ // Some intensive real-time application are relying on this ++ // deterministic behavior and use the first CPU core to run a slow ++ // thread while other CPU cores are fully used by real-time threads ++ // with SCHED_FIFO policy. Such applications prevent runc process ++ // from joining a container when the runc process is randomly ++ // scheduled on a CPU core owned by a real-time thread. ++ cpuAffinity := -1 ++ resetCPUAffinity := true ++ ++ if len(p.manager.GetPaths()) > 0 { ++ // Get the target container cgroup. ++ if cg, err := p.manager.GetCgroups(); err != nil { ++ // Close the pipe to not be blocked in the parent. ++ p.messageSockPair.child.Close() ++ p.logFilePair.child.Close() ++ return fmt.Errorf("getting container cgroups: %w", err) ++ } else if cg.CpusetCpus != "" { ++ definitive := false ++ ++ _, annotations := utils.Annotations(p.config.Config.Labels) ++ cpuAffinity, definitive, err = isolatedCPUAffinityTransition( ++ os.DirFS("/"), ++ cg.CpusetCpus, ++ annotations, ++ ) ++ if err != nil { ++ // Close the pipe to not be blocked in the parent. ++ p.messageSockPair.child.Close() ++ p.logFilePair.child.Close() ++ return fmt.Errorf("getting CPU affinity: %w", err) ++ } else if definitive { ++ resetCPUAffinity = false ++ } ++ } ++ } ++ ++ var err error ++ ++ if cpuAffinity < 0 { ++ err = p.cmd.Start() ++ } else { ++ err = startCommandWithCPUAffinity(p.cmd, cpuAffinity) ++ } ++ ++ // Close the write-side of the pipes (controlled by child). + p.messageSockPair.child.Close() + p.logFilePair.child.Close() + if err != nil { +@@ -145,6 +202,18 @@ func (p *setnsProcess) start() (retErr error) { + } + } + } ++ ++ if resetCPUAffinity { ++ // Fix the container process CPU affinity to match container cgroup cpuset, ++ // since kernel 6.2, the runc CPU affinity might affect the container process ++ // CPU affinity after cgroup cpuset transition, by example if runc is running ++ // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the ++ // resulting container process CPU affinity will be 1 instead of 1-2. 
++ if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { ++ return fmt.Errorf("error resetting container process CPU affinity: %w", err) ++ } ++ } ++ + if p.intelRdtPath != "" { + // if Intel RDT "resource control" filesystem path exists + _, err := os.Stat(p.intelRdtPath) +@@ -569,6 +638,14 @@ func (p *initProcess) start() (retErr error) { + if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { + return fmt.Errorf("error setting cgroup config for procHooks process: %w", err) + } ++ // Reset container process CPU affinity to match container cgroup cpuset, ++ // since kernel 6.2, the runc CPU affinity might affect the container process ++ // CPU affinity after cgroup cpuset transition, by example if runc is running ++ // with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the ++ // resulting container process CPU affinity will be 1 instead of 1-2. ++ if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil { ++ return fmt.Errorf("error resetting container process CPU affinity: %w", err) ++ } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Set(p.config.Config); err != nil { + return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err) +@@ -826,3 +903,193 @@ func initWaiter(r io.Reader) chan error { + + return ch + } ++ ++// isolatedCPUAffinityTransition returns a CPU affinity if necessary based on heuristics ++// and org.opencontainers.runc.exec.isolated-cpu-affinity-transition annotation value. ++func isolatedCPUAffinityTransition(rootFS fs.FS, cpusetList string, annotations map[string]string) (int, bool, error) { ++ const ( ++ isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" ++ nohzFullParam = "nohz_full" ++ ) ++ ++ definitive := false ++ ++ transition := annotations[isolatedCPUAffinityTransitionAnnotation] ++ switch transition { ++ case "temporary": ++ case "definitive": ++ definitive = true ++ default: ++ if transition != "" { ++ return -1, false, fmt.Errorf( ++ "unknown transition value %q for annotation %s", ++ transition, isolatedCPUAffinityTransitionAnnotation, ++ ) ++ } ++ return -1, false, nil ++ } ++ ++ kernelParams, err := kernelparam.LookupKernelBootParameters( ++ rootFS, ++ nohzFullParam, ++ ) ++ if err != nil { ++ // If /proc/cmdline does not exist or isn't readable, continue to read ++ // nohz_full from sysfs below. ++ if !errors.Is(err, os.ErrNotExist) && !errors.Is(err, os.ErrPermission) { ++ return -1, false, err ++ } ++ } ++ ++ // First get nohz_full value from kernel boot params, if not ++ // present, get the value from sysfs, to cover the case where ++ // CONFIG_NO_HZ_FULL_ALL is set, it also makes the integration ++ // tests not dependent on /sys/devices/system/cpu/nohz_full. ++ isolatedList := kernelParams[nohzFullParam] ++ if isolatedList == "" { ++ // Get the isolated CPU list, the error is not checked here because ++ // no matter what the error is, it returns without error the same way ++ // as with empty data. 
++ isolatedData, _ := fs.ReadFile(rootFS, "sys/devices/system/cpu/nohz_full") ++ isolatedList = string(bytes.TrimSpace(isolatedData)) ++ if isolatedList == "" || isolatedList == "(null)" { ++ return -1, false, nil ++ } ++ } ++ ++ cpu, err := getEligibleCPU(cpusetList, isolatedList) ++ if err != nil { ++ return -1, false, fmt.Errorf("getting eligible cpu: %w", err) ++ } else if cpu == -1 { ++ definitive = false ++ } ++ ++ return cpu, definitive, nil ++} ++ ++// getEligibleCPU returns the first eligible CPU for CPU affinity before ++// entering in a cgroup cpuset: ++// - when there is not cpuset cores: no eligible CPU (-1) ++// - when there is not isolated cores: no eligible CPU (-1) ++// - when cpuset cores are not in isolated cores: no eligible CPU (-1) ++// - when cpuset cores are all isolated cores: return the first CPU of the cpuset ++// - when cpuset cores are mixed between housekeeping/isolated cores: return the ++// first housekeeping CPU not in isolated CPUs. ++func getEligibleCPU(cpusetList, isolatedList string) (int, error) { ++ if isolatedList == "" || cpusetList == "" { ++ return -1, nil ++ } ++ ++ // The target container has a cgroup cpuset, get the bit range. ++ cpusetBits, err := systemd.RangeToBits(cpusetList) ++ if err != nil { ++ return -1, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) ++ } ++ ++ isolatedBits, err := systemd.RangeToBits(isolatedList) ++ if err != nil { ++ return -1, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err) ++ } ++ ++ eligibleCore := -1 ++ isolatedCores := 0 ++ ++ // Start from cpu core #0. ++ currentCore := 0 ++ // Handle mixed sets. ++ mixed := false ++ ++ // CPU core start from the first slice element and bits are read ++ // from the least to the most significant bit. ++ for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { ++ if byteRange >= len(isolatedBits) { ++ // No more isolated cores. ++ break ++ } ++ for bit := 0; bit < 8; bit++ { ++ if cpusetBits[byteRange]&(1< 0 { ++ return eligibleCore, nil ++ } ++ } ++ currentCore++ ++ } ++ } ++ ++ // We have an eligible CPU if there is at least one isolated CPU in the cpuset. ++ if isolatedCores == 0 { ++ return -1, nil ++ } ++ ++ return eligibleCore, nil ++} ++ ++// startCommandWithCPUAffinity starts a command on a specific CPU if set. ++func startCommandWithCPUAffinity(cmd *exec.Cmd, cpuAffinity int) error { ++ errCh := make(chan error) ++ defer close(errCh) ++ ++ // Use a goroutine to dedicate an OS thread. ++ go func() { ++ cpuSet := new(unix.CPUSet) ++ cpuSet.Zero() ++ cpuSet.Set(cpuAffinity) ++ ++ // Don't call runtime.UnlockOSThread to terminate the OS thread ++ // when goroutine exits. ++ runtime.LockOSThread() ++ ++ // Command inherits the CPU affinity. ++ if err := unix.SchedSetaffinity(unix.Gettid(), cpuSet); err != nil { ++ errCh <- fmt.Errorf("setting os thread CPU affinity: %w", err) ++ return ++ } ++ ++ errCh <- cmd.Start() ++ }() ++ ++ return <-errCh ++} ++ ++// fixProcessCPUAffinity sets the CPU affinity of a container process ++// to all CPUs allowed by container cgroup cpuset. ++func fixProcessCPUAffinity(pid int, manager cgroups.Manager) error { ++ cpusetList := manager.GetEffectiveCPUs() ++ if cpusetList == "" { ++ // If the cgroup cpuset is not present, the container will inherit ++ // this process CPU affinity, so it can return without further actions. 
++ return nil ++ } ++ ++ cpusetBits, err := systemd.RangeToBits(cpusetList) ++ if err != nil { ++ return fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err) ++ } ++ ++ processCPUSet := new(unix.CPUSet) ++ ++ for byteRange := 0; byteRange < len(cpusetBits); byteRange++ { ++ for bit := 0; bit < 8; bit++ { ++ processCPUSet.Set(byteRange*8 + bit) ++ } ++ } ++ ++ if err := unix.SchedSetaffinity(pid, processCPUSet); err != nil { ++ return fmt.Errorf("setting process PID %d CPU affinity: %w", pid, err) ++ } ++ ++ return nil ++} +diff --git a/libcontainer/process_linux_test.go b/libcontainer/process_linux_test.go +new file mode 100644 +index 0000000..8303643 +--- /dev/null ++++ b/libcontainer/process_linux_test.go +@@ -0,0 +1,232 @@ ++package libcontainer ++ ++import ( ++ "io/fs" ++ "testing" ++ "testing/fstest" ++) ++ ++func TestIsolatedCPUAffinityTransition(t *testing.T) { ++ const isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition" ++ ++ noAffinity := -1 ++ temporaryTransition := "temporary" ++ definitiveTransition := "definitive" ++ ++ tests := []struct { ++ name string ++ testFS fs.FS ++ cpuset string ++ expectedErr bool ++ expectedAffinityCore int ++ expectedDefinitiveTransition bool ++ annotations map[string]string ++ }{ ++ { ++ name: "no affinity", ++ cpuset: "0-15", ++ testFS: fstest.MapFS{ ++ "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, ++ }, ++ expectedAffinityCore: noAffinity, ++ expectedDefinitiveTransition: false, ++ }, ++ { ++ name: "affinity match with temporary transition", ++ cpuset: "3-4", ++ testFS: fstest.MapFS{ ++ "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, ++ }, ++ expectedAffinityCore: 3, ++ expectedDefinitiveTransition: false, ++ annotations: map[string]string{ ++ isolatedCPUAffinityTransitionAnnotation: temporaryTransition, ++ }, ++ }, ++ { ++ name: "affinity match with temporary transition and nohz_full boot param", ++ cpuset: "3-4", ++ testFS: fstest.MapFS{ ++ "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=0-4\n")}, ++ }, ++ expectedAffinityCore: 3, ++ expectedDefinitiveTransition: false, ++ annotations: map[string]string{ ++ isolatedCPUAffinityTransitionAnnotation: temporaryTransition, ++ }, ++ }, ++ { ++ name: "affinity match with definitive transition", ++ cpuset: "3-4", ++ testFS: fstest.MapFS{ ++ "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("0-4\n")}, ++ }, ++ expectedAffinityCore: 3, ++ expectedDefinitiveTransition: true, ++ annotations: map[string]string{ ++ isolatedCPUAffinityTransitionAnnotation: definitiveTransition, ++ }, ++ }, ++ { ++ name: "affinity match with definitive transition and nohz_full boot param", ++ cpuset: "3-4", ++ testFS: fstest.MapFS{ ++ "proc/cmdline": &fstest.MapFile{Data: []byte("nohz_full=0-4\n")}, ++ }, ++ expectedAffinityCore: 3, ++ expectedDefinitiveTransition: true, ++ annotations: map[string]string{ ++ isolatedCPUAffinityTransitionAnnotation: definitiveTransition, ++ }, ++ }, ++ { ++ name: "affinity error with bad isolated set", ++ cpuset: "0-15", ++ testFS: fstest.MapFS{ ++ "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("bad_isolated_set\n")}, ++ }, ++ expectedErr: true, ++ expectedAffinityCore: noAffinity, ++ annotations: map[string]string{ ++ isolatedCPUAffinityTransitionAnnotation: temporaryTransition, ++ }, ++ }, ++ { ++ name: "affinity error with bad isolated set for nohz_full boot param", ++ cpuset: "0-15", ++ testFS: fstest.MapFS{ ++ "proc/cmdline": 
&fstest.MapFile{Data: []byte("nohz_full=bad_isolated_set\n")}, ++ }, ++ expectedErr: true, ++ expectedAffinityCore: noAffinity, ++ annotations: map[string]string{ ++ isolatedCPUAffinityTransitionAnnotation: temporaryTransition, ++ }, ++ }, ++ { ++ name: "no affinity with null isolated set value", ++ cpuset: "0-15", ++ testFS: fstest.MapFS{ ++ "sys/devices/system/cpu/nohz_full": &fstest.MapFile{Data: []byte("(null)\n")}, ++ }, ++ expectedAffinityCore: noAffinity, ++ expectedDefinitiveTransition: false, ++ annotations: map[string]string{ ++ isolatedCPUAffinityTransitionAnnotation: temporaryTransition, ++ }, ++ }, ++ } ++ ++ for _, tt := range tests { ++ t.Run(tt.name, func(t *testing.T) { ++ affinityCore, definitive, err := isolatedCPUAffinityTransition(tt.testFS, tt.cpuset, tt.annotations) ++ if err != nil && !tt.expectedErr { ++ t.Fatalf("unexpected error: %s", err) ++ } else if err == nil && tt.expectedErr { ++ t.Fatalf("unexpected success") ++ } else if tt.expectedDefinitiveTransition != definitive { ++ t.Fatalf("expected reset affinity %t: got %t instead", tt.expectedDefinitiveTransition, definitive) ++ } else if tt.expectedAffinityCore != affinityCore { ++ t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) ++ } ++ }) ++ } ++} ++ ++func TestGetEligibleCPU(t *testing.T) { ++ tests := []struct { ++ name string ++ cpuset string ++ isolset string ++ expectedErr bool ++ expectedAffinityCore int ++ expectedEligible bool ++ }{ ++ { ++ name: "no cpuset", ++ isolset: "2-15,18-31,34-47", ++ expectedEligible: false, ++ }, ++ { ++ name: "no isolated set", ++ cpuset: "0-15", ++ expectedEligible: false, ++ }, ++ { ++ name: "bad cpuset format", ++ cpuset: "core0 to core15", ++ isolset: "2-15,18-31,34-47", ++ expectedErr: true, ++ }, ++ { ++ name: "bad isolated set format", ++ cpuset: "0-15", ++ isolset: "core0 to core15", ++ expectedErr: true, ++ }, ++ { ++ name: "no eligible core", ++ cpuset: "0-1,16-17,32-33", ++ isolset: "2-15,18-31,34-47", ++ expectedEligible: false, ++ }, ++ { ++ name: "no eligible core inverted", ++ cpuset: "2-15,18-31,34-47", ++ isolset: "0-1,16-17,32-33", ++ expectedEligible: false, ++ }, ++ { ++ name: "eligible core mixed", ++ cpuset: "8-31", ++ isolset: "2-15,18-31,34-47", ++ expectedEligible: true, ++ expectedAffinityCore: 16, ++ }, ++ { ++ name: "eligible core #4", ++ cpuset: "4-7", ++ isolset: "2-15,18-31,34-47", ++ expectedEligible: true, ++ expectedAffinityCore: 4, ++ }, ++ { ++ name: "eligible core #40", ++ cpuset: "40-47", ++ isolset: "2-15,18-31,34-47", ++ expectedEligible: true, ++ expectedAffinityCore: 40, ++ }, ++ { ++ name: "eligible core #24", ++ cpuset: "24-31", ++ isolset: "2-15,18-31,34-47", ++ expectedEligible: true, ++ expectedAffinityCore: 24, ++ }, ++ { ++ name: "no eligible core small isolated set", ++ cpuset: "60-63", ++ isolset: "0-1", ++ expectedEligible: false, ++ }, ++ } ++ ++ for _, tt := range tests { ++ t.Run(tt.name, func(t *testing.T) { ++ affinityCore, err := getEligibleCPU(tt.cpuset, tt.isolset) ++ eligible := affinityCore >= 0 ++ if err != nil && !tt.expectedErr { ++ t.Fatalf("unexpected error: %s", err) ++ } else if err == nil && tt.expectedErr { ++ t.Fatalf("unexpected success") ++ } else if tt.expectedEligible && !eligible { ++ t.Fatalf("was expecting eligible core but no eligible core returned") ++ } else if !tt.expectedEligible && eligible { ++ t.Fatalf("was not expecting eligible core but got eligible core") ++ } else if tt.expectedEligible && tt.expectedAffinityCore != affinityCore { ++ 
t.Fatalf("expected affinity core %d: got %d instead", tt.expectedAffinityCore, affinityCore) ++ } ++ }) ++ } ++} +diff --git a/libcontainer/system/kernelparam/lookup.go b/libcontainer/system/kernelparam/lookup.go +new file mode 100644 +index 0000000..4cf4524 +--- /dev/null ++++ b/libcontainer/system/kernelparam/lookup.go +@@ -0,0 +1,41 @@ ++package kernelparam ++ ++import ( ++ "io/fs" ++ "strings" ++) ++ ++func runeFilter(c rune) bool { ++ return c < '!' || c > '~' ++} ++ ++// LookupKernelBootParameters returns the selected kernel parameters specified ++// in the kernel command line. The parameters are returned as a map of key-value pairs. ++func LookupKernelBootParameters(rootFS fs.FS, lookupParameters ...string) (map[string]string, error) { ++ cmdline, err := fs.ReadFile(rootFS, "proc/cmdline") ++ if err != nil { ++ return nil, err ++ } ++ ++ kernelParameters := make(map[string]string) ++ remaining := len(lookupParameters) ++ ++ for _, parameter := range strings.FieldsFunc(string(cmdline), runeFilter) { ++ if remaining == 0 { ++ break ++ } ++ idx := strings.IndexByte(parameter, '=') ++ if idx == -1 { ++ continue ++ } ++ for _, lookupParam := range lookupParameters { ++ if lookupParam == parameter[:idx] { ++ kernelParameters[lookupParam] = parameter[idx+1:] ++ remaining-- ++ break ++ } ++ } ++ } ++ ++ return kernelParameters, nil ++} +diff --git a/libcontainer/system/kernelparam/lookup_test.go b/libcontainer/system/kernelparam/lookup_test.go +new file mode 100644 +index 0000000..9d90630 +--- /dev/null ++++ b/libcontainer/system/kernelparam/lookup_test.go +@@ -0,0 +1,60 @@ ++package kernelparam ++ ++import ( ++ "testing" ++ "testing/fstest" ++) ++ ++func TestLookupKernelBootParameters(t *testing.T) { ++ for _, test := range []struct { ++ cmdline string ++ lookupParameters []string ++ expectedKernelParameters map[string]string ++ }{ ++ { ++ cmdline: "root=/dev/sda1 ro console=ttyS0 console=tty0", ++ lookupParameters: []string{"root"}, ++ expectedKernelParameters: map[string]string{ ++ "root": "/dev/sda1", ++ }, ++ }, ++ { ++ cmdline: "ro runc.kernel_parameter=a_value console=ttyS0 console=tty0", ++ lookupParameters: []string{"runc.kernel_parameter"}, ++ expectedKernelParameters: map[string]string{ ++ "runc.kernel_parameter": "a_value", ++ }, ++ }, ++ { ++ cmdline: "ro runc.kernel_parameter_a=value_a runc.kernel_parameter_b=value_a:value_b", ++ lookupParameters: []string{ ++ "runc.kernel_parameter_a", ++ "runc.kernel_parameter_b", ++ }, ++ expectedKernelParameters: map[string]string{ ++ "runc.kernel_parameter_a": "value_a", ++ "runc.kernel_parameter_b": "value_a:value_b", ++ }, ++ }, ++ { ++ cmdline: "root=/dev/sda1 ro console=ttyS0 console=tty0", ++ lookupParameters: []string{"runc.kernel_parameter_a"}, ++ expectedKernelParameters: map[string]string{}, ++ }, ++ } { ++ params, err := LookupKernelBootParameters(fstest.MapFS{ ++ "proc/cmdline": &fstest.MapFile{Data: []byte(test.cmdline + "\n")}, ++ }, test.lookupParameters...) 
++ if err != nil { ++ t.Fatalf("unexpected error: %s", err) ++ } ++ if len(params) != len(test.expectedKernelParameters) { ++ t.Fatalf("expected %d parameters, got %d", len(test.expectedKernelParameters), len(params)) ++ } ++ for k, v := range test.expectedKernelParameters { ++ if params[k] != v { ++ t.Fatalf("expected parameter %s to be %s, got %s", k, v, params[k]) ++ } ++ } ++ } ++} +diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash +index 23e32e9..0718a1f 100644 +--- a/tests/integration/helpers.bash ++++ b/tests/integration/helpers.bash +@@ -325,6 +325,27 @@ function set_cgroup_mount_writable() { + update_config '.mounts |= map((select(.type == "cgroup") | .options -= ["ro"]) // .)' + } + ++# Helper function to get all online cpus. ++function get_all_online_cpus() { ++ cat /sys/devices/system/cpu/online ++} ++ ++# Helper function to get the first online cpu. ++function get_first_online_cpu() { ++ [[ $(get_all_online_cpus) =~ [^0-9]*([0-9]+)([-,][0-9]+)? ]] && echo "${BASH_REMATCH[1]}" ++} ++ ++# Helper function to set all cpus/mems in container cgroup cpuset. ++function set_cgroup_cpuset_all_cpus() { ++ update_config ".linux.resources.cpu.cpus = \"$(get_all_online_cpus)\"" ++ ++ local mems ++ mems="$(cat /sys/devices/system/node/online 2>/dev/null || true)" ++ if [[ -n $mems ]]; then ++ update_config ".linux.resources.cpu.mems = \"$mems\"" ++ fi ++} ++ + # Fails the current test, providing the error given. + function fail() { + echo "$@" >&2 +-- +2.33.0 + diff --git a/patch/0041-runc-fix-a-data-race.patch b/patch/0041-runc-fix-a-data-race.patch new file mode 100644 index 0000000..2a42c4f --- /dev/null +++ b/patch/0041-runc-fix-a-data-race.patch @@ -0,0 +1,26 @@ +From d1ef3ab619c7743d389fc882ec65df38d140fc08 Mon Sep 17 00:00:00 2001 +From: zhongjiawei +Date: Mon, 17 Jun 2024 23:22:39 +0800 +Subject: [PATCH] libct/config: fix a data race + +Reference:https://github.com/opencontainers/runc/commit/c342872276d4a3d5f662684115e282cbb20bf227 +--- + libcontainer/configs/config.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go +index f85ade3f..c9ecc3cb 100644 +--- a/libcontainer/configs/config.go ++++ b/libcontainer/configs/config.go +@@ -455,7 +455,7 @@ func (c Command) Run(s *specs.State) error { + return err + case <-timerCh: + cmd.Process.Kill() +- cmd.Wait() ++ <-errC + return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) + case <-timeAfter: + if c.Timeout != nil { +-- +2.33.0 + diff --git a/runc.spec b/runc.spec index 479666f..a697537 100644 --- a/runc.spec +++ b/runc.spec @@ -3,7 +3,7 @@ Name: runc Version: 1.1.8 -Release: 16 +Release: 17 Summary: runc is a CLI tool for spawning and running containers according to the OCI specification. License: ASL 2.0 @@ -57,6 +57,12 @@ install -p -m 755 runc $RPM_BUILD_ROOT/%{_bindir}/runc %{_bindir}/runc %changelog +* Wed Jun 19 2024 zhongjiawei - 1.1.8-17 +- Type:bugfix +- CVE:NA +- SUG:NA +- DESC:sync some patches + * Thu May 23 2024 zhongjiawei - 1.1.8-16 - Type:bugfix - CVE:NA diff --git a/series.conf b/series.conf index 681bf5b..bd6ecb0 100644 --- a/series.conf +++ b/series.conf @@ -36,3 +36,5 @@ patch/0036-runc-increase-the-number-of-cgroup-deletion-retries.patch patch/0037-runc-fix-CVE-2024-21626.patch patch/0038-runc-check-cmd-exist.patch patch/0039-runc-fix-CVE-2024-3154.patch +patch/0040-runc-Set-temporary-single-CPU-affinity-before-cgroup-cpus.patch +patch/0041-runc-fix-a-data-race.patch -- Gitee