From 1a6ece72043f497f98aa5f6c3d845395a04fe202 Mon Sep 17 00:00:00 2001 From: zhongjiawei Date: Thu, 1 Feb 2024 16:51:35 +0800 Subject: [PATCH] runc:fix CVE-2024-21626 (cherry picked from commit 6e9b77988428e4184978084eccfa08612f3c5b0f) --- patch/0148-runc-fix-CVE-2024-21626.patch | 2765 ++++++++++++++++++++++ runc.spec | 8 +- series.conf | 1 + 3 files changed, 2773 insertions(+), 1 deletion(-) create mode 100644 patch/0148-runc-fix-CVE-2024-21626.patch diff --git a/patch/0148-runc-fix-CVE-2024-21626.patch b/patch/0148-runc-fix-CVE-2024-21626.patch new file mode 100644 index 0000000..cb87495 --- /dev/null +++ b/patch/0148-runc-fix-CVE-2024-21626.patch @@ -0,0 +1,2765 @@ +From e81938064402940ca8176d6f3145f65b1d455996 Mon Sep 17 00:00:00 2001 +From: zhongjiawei +Date: Thu, 1 Feb 2024 18:25:16 +0800 +Subject: [PATCH] runc:fix CVE-2024-21626 + +--- + libcontainer/container_linux.go | 50 +- + libcontainer/container_linux.go.orig | 1660 ----------------- + libcontainer/factory_linux.go | 15 +- + libcontainer/init_linux.go | 39 +- + libcontainer/process_linux.go | 3 +- + libcontainer/setns_init_linux.go | 19 + + libcontainer/standard_init_linux.go | 28 +- + libcontainer/standard_init_linux.go.orig | 223 --- + libcontainer/utils/utils.go | 38 - + libcontainer/utils/utils_unix.go | 253 ++- + vendor/golang.org/x/sys/unix/flock.go | 5 + + .../x/sys/unix/zerrors_linux_amd64.go | 1 + + .../x/sys/unix/zerrors_linux_arm64.go | 1 + + .../x/sys/unix/zsyscall_linux_amd64.go | 10 + + .../x/sys/unix/zsyscall_linux_arm64.go | 10 + + .../x/sys/unix/zsysnum_linux_amd64.go | 1 + + .../x/sys/unix/zsysnum_linux_arm64.go | 1 + + .../x/sys/unix/ztypes_linux_amd64.go | 5 + + .../x/sys/unix/ztypes_linux_arm64.go | 5 + + 19 files changed, 403 insertions(+), 1964 deletions(-) + delete mode 100644 libcontainer/container_linux.go.orig + delete mode 100644 libcontainer/standard_init_linux.go.orig + +diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go +index a4859ca..c757d71 100644 +--- a/libcontainer/container_linux.go ++++ b/libcontainer/container_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -28,6 +29,7 @@ import ( + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/syndtr/gocapability/capability" + "github.com/vishvananda/netlink/nl" ++ "golang.org/x/sys/unix" + ) + + const stdioFdCount = 3 +@@ -321,6 +323,15 @@ func (c *linuxContainer) start(process *Process) error { + }() + } + ++ // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC ++ // to make sure we don't leak any files into "runc init". Any files to be ++ // passed to "runc init" through ExtraFiles will get dup2'd by the Go ++ // runtime and thus their O_CLOEXEC flag will be cleared. This is some ++ // additional protection against attacks like CVE-2024-21626, by making ++ // sure we never leak files to "runc init" we didn't intend to. ++ if err := utils.CloseExecFrom(3); err != nil { ++ return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) ++ } + if err := parent.start(); err != nil { + // terminate the process to ensure that it properly is reaped. + if err := parent.terminate(); err != nil { +@@ -414,6 +425,23 @@ func (c *linuxContainer) deleteExecFifo() { + os.Remove(fifoName) + } + ++// includeExecFifo opens the container's execfifo as a pathfd, so that the ++// container cannot access the statedir (and the FIFO itself remains ++// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited ++// fd, with _LIBCONTAINER_FIFOFD set to its fd number. ++func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { ++ fifoName := filepath.Join(c.root, execFifoFilename) ++ fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) ++ if err != nil { ++ return err ++ } ++ ++ cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName)) ++ cmd.Env = append(cmd.Env, ++ fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) ++ return nil ++} ++ + func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { + parentPipe, childPipe, err := utils.NewSockPair("init") + if err != nil { +@@ -430,18 +458,15 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { + return c.newSetnsProcess(p, cmd, parentPipe, childPipe) + } + +- // We only set up rootDir if we're not doing a `runc exec`. The reason for +- // this is to avoid cases where a racing, unprivileged process inside the +- // container can get access to the statedir file descriptor (which would +- // allow for container rootfs escape). +- rootDir, err := os.Open(c.root) +- if err != nil { +- return nil, err ++ // We only set up fifoFd if we're not doing a `runc exec`. The historic ++ // reason for this is that previously we would pass a dirfd that allowed ++ // for container rootfs escape (and not doing it in `runc exec` avoided ++ // that problem), but we no longer do that. However, there's no need to do ++ // this for `runc exec` so we just keep it this way to be safe. ++ if err := c.includeExecFifo(cmd); err != nil { ++ return nil, fmt.Errorf("unable to setup exec fifo: %w", err) + } +- cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) +- cmd.Env = append(cmd.Env, +- fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) +- return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) ++ return c.newInitProcess(p, cmd, parentPipe, childPipe) + } + + func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { +@@ -479,7 +504,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. + return cmd, nil + } + +-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { ++func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { +@@ -501,7 +526,6 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c + process: p, + bootstrapData: data, + sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID), +- rootDir: rootDir, + }, nil + } + +diff --git a/libcontainer/container_linux.go.orig b/libcontainer/container_linux.go.orig +deleted file mode 100644 +index d678407..0000000 +--- a/libcontainer/container_linux.go.orig ++++ /dev/null +@@ -1,1660 +0,0 @@ +-// +build linux +- +-package libcontainer +- +-import ( +- "bytes" +- "encoding/json" +- "errors" +- "fmt" +- "io" +- "io/ioutil" +- "os" +- "os/exec" +- "path/filepath" +- "reflect" +- "strings" +- "sync" +- "syscall" +- "time" +- +- "github.com/Sirupsen/logrus" +- "github.com/golang/protobuf/proto" +- "github.com/opencontainers/runc/libcontainer/cgroups" +- "github.com/opencontainers/runc/libcontainer/configs" +- "github.com/opencontainers/runc/libcontainer/criurpc" +- "github.com/opencontainers/runc/libcontainer/logs" +- "github.com/opencontainers/runc/libcontainer/system" +- "github.com/opencontainers/runc/libcontainer/utils" +- "github.com/syndtr/gocapability/capability" +- "github.com/vishvananda/netlink/nl" +-) +- +-const stdioFdCount = 3 +- +-type linuxContainer struct { +- id string +- root string +- config *configs.Config +- cgroupManager cgroups.Manager +- initArgs []string +- initProcess parentProcess +- initProcessStartTime string +- criuPath string +- m sync.Mutex +- criuVersion int +- state containerState +- created time.Time +-} +- +-// State represents a running container's state +-type State struct { +- BaseState +- +- // Platform specific fields below here +- +- // Specifies if the container was started under the rootless mode. +- Rootless bool `json:"rootless"` +- +- // Path to all the cgroups setup for a container. Key is cgroup subsystem name +- // with the value as the path. +- CgroupPaths map[string]string `json:"cgroup_paths"` +- +- // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type +- // with the value as the path. +- NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` +- +- // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore +- ExternalDescriptors []string `json:"external_descriptors,omitempty"` +-} +- +-// CompatState +-type CompatState struct { +- State +- Config configs.CompatConfig `json:"config"` +-} +- +-// Container is a libcontainer container object. +-// +-// Each container is thread-safe within the same process. Since a container can +-// be destroyed by a separate process, any function may return that the container +-// was not found. +-type Container interface { +- BaseContainer +- +- // Methods below here are platform specific +- +- // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. +- // +- // errors: +- // Systemerror - System error. +- Checkpoint(criuOpts *CriuOpts) error +- +- // Restore restores the checkpointed container to a running state using the criu(8) utility. +- // +- // errors: +- // Systemerror - System error. +- Restore(process *Process, criuOpts *CriuOpts) error +- +- // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses +- // the execution of any user processes. Asynchronously, when the container finished being paused the +- // state is changed to PAUSED. +- // If the Container state is PAUSED, do nothing. +- // +- // errors: +- // ContainerNotExists - Container no longer exists, +- // ContainerNotRunning - Container not running or created, +- // Systemerror - System error. +- Pause() error +- +- // If the Container state is PAUSED, resumes the execution of any user processes in the +- // Container before setting the Container state to RUNNING. +- // If the Container state is RUNNING, do nothing. +- // +- // errors: +- // ContainerNotExists - Container no longer exists, +- // ContainerNotPaused - Container is not paused, +- // Systemerror - System error. +- Resume() error +- +- // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. +- // +- // errors: +- // Systemerror - System error. +- NotifyOOM() (<-chan struct{}, error) +- +- // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level +- // +- // errors: +- // Systemerror - System error. +- NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) +-} +- +-// ID returns the container's unique ID +-func (c *linuxContainer) ID() string { +- return c.id +-} +- +-// Config returns the container's configuration +-func (c *linuxContainer) Config() configs.Config { +- return *c.config +-} +- +-func (c *linuxContainer) Status() (Status, error) { +- c.m.Lock() +- defer c.m.Unlock() +- return c.currentStatus() +-} +- +-func (c *linuxContainer) State() (*State, error) { +- c.m.Lock() +- defer c.m.Unlock() +- return c.currentState() +-} +- +-func (c *linuxContainer) Processes() ([]int, error) { +- pids, err := c.cgroupManager.GetAllPids() +- if err != nil { +- return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") +- } +- return pids, nil +-} +- +-func (c *linuxContainer) Stats() (*Stats, error) { +- var ( +- err error +- stats = &Stats{} +- ) +- if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { +- return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") +- } +- for _, iface := range c.config.Networks { +- switch iface.Type { +- case "veth": +- istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) +- if err != nil { +- return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) +- } +- stats.Interfaces = append(stats.Interfaces, istats) +- } +- } +- return stats, nil +-} +- +-func (c *linuxContainer) Set(config configs.Config) error { +- c.m.Lock() +- defer c.m.Unlock() +- status, err := c.currentStatus() +- if err != nil { +- return err +- } +- if status == Stopped { +- return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) +- } +- c.config = &config +- return c.cgroupManager.Set(c.config) +-} +- +-func (c *linuxContainer) Start(process *Process) error { +- c.m.Lock() +- defer c.m.Unlock() +- if process.Init { +- if err := c.createExecFifo(); err != nil { +- return err +- } +- } +- if err := c.start(process); err != nil { +- if process.Init { +- c.deleteExecFifo() +- } +- return err +- } +- return nil +-} +- +-func (c *linuxContainer) Run(process *Process) error { +- if err := c.Start(process); err != nil { +- return err +- } +- if process.Init { +- return c.exec() +- } +- return nil +-} +- +-func (c *linuxContainer) Exec() error { +- c.m.Lock() +- defer c.m.Unlock() +- return c.exec() +-} +- +-func (c *linuxContainer) exec() error { +- path := filepath.Join(c.root, execFifoFilename) +- +- fifoOpen := make(chan struct{}) +- select { +- case <-awaitProcessExit(c.initProcess.pid(), fifoOpen): +- return errors.New("container process is already dead") +- case result := <-awaitFifoOpen(path, fifoOpen): +- if result.err != nil { +- return result.err +- } +- f := result.file +- defer f.Close() +- if err := readFromExecFifo(f); err != nil { +- return err +- } +- if err := os.Remove(path); !os.IsNotExist(err) { +- return err +- } +- return nil +- } +-} +- +-func readFromExecFifo(execFifo io.Reader) error { +- data, err := ioutil.ReadAll(execFifo) +- if err != nil { +- return err +- } +- if len(data) <= 0 { +- return fmt.Errorf("cannot start an already running container") +- } +- return nil +-} +- +-func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} { +- isDead := make(chan struct{}) +- go func() { +- for { +- select { +- case <-exit: +- return +- case <-time.After(time.Millisecond * 100): +- stat, err := system.GetProcessState(pid) +- if err != nil || stat == system.Zombie { +- select { +- case <-exit: +- return +- default: +- close(isDead) +- } +- return +- } +- } +- } +- }() +- return isDead +-} +- +-func awaitFifoOpen(path string, fifoOpen chan struct{}) <-chan openResult { +- fifoOpened := make(chan openResult) +- go func() { +- f, err := os.OpenFile(path, os.O_RDONLY, 0) +- close(fifoOpen) +- if err != nil { +- fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")} +- return +- } +- fifoOpened <- openResult{file: f} +- }() +- return fifoOpened +-} +- +-type openResult struct { +- file *os.File +- err error +-} +- +-func (c *linuxContainer) start(process *Process) error { +- parent, err := c.newParentProcess(process) +- if err != nil { +- return newSystemErrorWithCause(err, "creating new parent process") +- } +- +- if logsDone := logs.ForwardLogs(); logsDone != nil { +- defer func() { +- select { +- case <-logsDone: +- case <-time.After(3 * time.Second): +- logrus.Warnf("wait child close logfd timeout") +- } +- }() +- } +- +- if err := parent.start(); err != nil { +- // terminate the process to ensure that it properly is reaped. +- if err := parent.terminate(); err != nil { +- logrus.Warnf("parent process terminate error: %v", err) +- } +- return newSystemErrorWithCause(err, "starting container process") +- } +- // generate a timestamp indicating when the container was started +- c.created = time.Now().UTC() +- if process.Init { +- c.state = &createdState{ +- c: c, +- } +- state, err := c.updateState(parent) +- if err != nil { +- return err +- } +- c.initProcessStartTime = state.InitProcessStartTime +- +- if c.config.Hooks != nil { +- s := configs.HookState{ +- SpecState: configs.SpecState{ +- Version: c.config.Version, +- ID: c.id, +- Pid: parent.pid(), +- Bundle: utils.SearchLabels(c.config.Labels, "bundle"), +- }, +- Root: c.config.Rootfs, +- } +- for i, hook := range c.config.Hooks.Poststart { +- logrus.Infof("run poststart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID) +- if err := hook.Run(s); err != nil { +- logrus.Warnf("running poststart hook %d:%s failed: %s, ContainerId: %s", i, hook.Info(), err, s.ID) +- } +- } +- } +- } else { +- c.state = &runningState{ +- c: c, +- } +- } +- return nil +-} +- +-func (c *linuxContainer) Signal(s os.Signal, all bool) error { +- if all { +- return signalAllProcesses(c.cgroupManager, s) +- } +- status, err := c.currentStatus() +- if err != nil { +- return err +- } +- // to avoid a PID reuse attack +- if status == Running || status == Created { +- if err := c.initProcess.signal(s); err != nil { +- return newSystemErrorWithCause(err, "signaling init process") +- } +- return nil +- } +- return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) +-} +- +-func (c *linuxContainer) createExecFifo() error { +- rootuid, err := c.Config().HostRootUID() +- if err != nil { +- return err +- } +- rootgid, err := c.Config().HostRootGID() +- if err != nil { +- return err +- } +- +- fifoName := filepath.Join(c.root, execFifoFilename) +- if _, err := os.Stat(fifoName); err == nil { +- return fmt.Errorf("exec fifo %s already exists", fifoName) +- } +- oldMask := syscall.Umask(0000) +- if err := syscall.Mkfifo(fifoName, 0622); err != nil { +- syscall.Umask(oldMask) +- return err +- } +- syscall.Umask(oldMask) +- if err := os.Chown(fifoName, rootuid, rootgid); err != nil { +- return err +- } +- return nil +-} +- +-func (c *linuxContainer) deleteExecFifo() { +- fifoName := filepath.Join(c.root, execFifoFilename) +- os.Remove(fifoName) +-} +- +-func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { +- parentPipe, childPipe, err := utils.NewSockPair("init") +- if err != nil { +- return nil, newSystemErrorWithCause(err, "creating new init pipe") +- } +- if err := logs.InitLogPipe(); err != nil { +- return nil, fmt.Errorf("Unable to create the log pipe: %s", err) +- } +- cmd, err := c.commandTemplate(p, childPipe) +- if err != nil { +- return nil, newSystemErrorWithCause(err, "creating new command template") +- } +- if !p.Init { +- return c.newSetnsProcess(p, cmd, parentPipe, childPipe) +- } +- +- // We only set up rootDir if we're not doing a `runc exec`. The reason for +- // this is to avoid cases where a racing, unprivileged process inside the +- // container can get access to the statedir file descriptor (which would +- // allow for container rootfs escape). +- rootDir, err := os.Open(c.root) +- if err != nil { +- return nil, err +- } +- cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) +- cmd.Env = append(cmd.Env, +- fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) +- return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) +-} +- +-func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { +- cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) +- cmd.Stdin = p.Stdin +- cmd.Stdout = p.Stdout +- cmd.Stderr = p.Stderr +- cmd.Dir = c.config.Rootfs +- if cmd.SysProcAttr == nil { +- cmd.SysProcAttr = &syscall.SysProcAttr{} +- } +- cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) +- if p.ConsoleSocket != nil { +- cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) +- cmd.Env = append(cmd.Env, +- fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), +- ) +- } +- cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) +- cmd.Env = append(cmd.Env, +- fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), +- ) +- +- cmd.ExtraFiles = append(cmd.ExtraFiles, logs.ChildLogPipe) +- cmd.Env = append(cmd.Env, +- fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), +- ) +- +- // NOTE: when running a container with no PID namespace and the parent process spawning the container is +- // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason +- // even with the parent still running. +- if c.config.ParentDeathSignal > 0 { +- cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal) +- } +- return cmd, nil +-} +- +-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { +- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) +- nsMaps := make(map[configs.NamespaceType]string) +- for _, ns := range c.config.Namespaces { +- if ns.Path != "" { +- nsMaps[ns.Type] = ns.Path +- } +- } +- data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) +- if err != nil { +- return nil, err +- } +- return &initProcess{ +- cmd: cmd, +- childPipe: childPipe, +- parentPipe: parentPipe, +- manager: c.cgroupManager, +- config: c.newInitConfig(p), +- container: c, +- process: p, +- bootstrapData: data, +- sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID), +- rootDir: rootDir, +- }, nil +-} +- +-func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { +- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) +- state, err := c.currentState() +- if err != nil { +- return nil, newSystemErrorWithCause(err, "getting container's current state") +- } +- // for setns process, we don't have to set cloneflags as the process namespaces +- // will only be set via setns syscall +- data, err := c.bootstrapData(0, state.NamespacePaths) +- if err != nil { +- return nil, err +- } +- return &setnsProcess{ +- cmd: cmd, +- cgroupPaths: c.cgroupManager.GetPaths(), +- childPipe: childPipe, +- parentPipe: parentPipe, +- config: c.newInitConfig(p), +- process: p, +- bootstrapData: data, +- }, nil +-} +- +-func (c *linuxContainer) newInitConfig(process *Process) *initConfig { +- cfg := &initConfig{ +- Config: c.config, +- Args: process.Args, +- Env: process.Env, +- User: process.User, +- AdditionalGroups: process.AdditionalGroups, +- Cwd: process.Cwd, +- Capabilities: process.Capabilities, +- PassedFilesCount: len(process.ExtraFiles), +- ContainerId: c.ID(), +- NoNewPrivileges: c.config.NoNewPrivileges, +- Rootless: c.config.Rootless, +- AppArmorProfile: c.config.AppArmorProfile, +- ProcessLabel: c.config.ProcessLabel, +- Rlimits: c.config.Rlimits, +- } +- if process.NoNewPrivileges != nil { +- cfg.NoNewPrivileges = *process.NoNewPrivileges +- } +- if process.AppArmorProfile != "" { +- cfg.AppArmorProfile = process.AppArmorProfile +- } +- if process.Label != "" { +- cfg.ProcessLabel = process.Label +- } +- if len(process.Rlimits) > 0 { +- cfg.Rlimits = process.Rlimits +- } +- cfg.CreateConsole = process.ConsoleSocket != nil +- return cfg +-} +- +-func (c *linuxContainer) Destroy() error { +- c.m.Lock() +- defer c.m.Unlock() +- return c.state.destroy() +-} +- +-func (c *linuxContainer) Pause() error { +- c.m.Lock() +- defer c.m.Unlock() +- status, err := c.currentStatus() +- if err != nil { +- return err +- } +- switch status { +- case Running, Created: +- if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { +- return err +- } +- return c.state.transition(&pausedState{ +- c: c, +- }) +- } +- return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning) +-} +- +-func (c *linuxContainer) Resume() error { +- c.m.Lock() +- defer c.m.Unlock() +- status, err := c.currentStatus() +- if err != nil { +- return err +- } +- if status != Paused { +- return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) +- } +- if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { +- return err +- } +- return c.state.transition(&runningState{ +- c: c, +- }) +-} +- +-func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { +- // XXX(cyphar): This requires cgroups. +- if c.config.Rootless { +- return nil, fmt.Errorf("cannot get OOM notifications from rootless container") +- } +- return notifyOnOOM(c.cgroupManager.GetPaths()) +-} +- +-func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { +- // XXX(cyphar): This requires cgroups. +- if c.config.Rootless { +- return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") +- } +- return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) +-} +- +-var criuFeatures *criurpc.CriuFeatures +- +-func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error { +- +- var t criurpc.CriuReqType +- t = criurpc.CriuReqType_FEATURE_CHECK +- +- if err := c.checkCriuVersion("1.8"); err != nil { +- // Feature checking was introduced with CRIU 1.8. +- // Ignore the feature check if an older CRIU version is used +- // and just act as before. +- // As all automated PR testing is done using CRIU 1.7 this +- // code will not be tested by automated PR testing. +- return nil +- } +- +- // make sure the features we are looking for are really not from +- // some previous check +- criuFeatures = nil +- +- req := &criurpc.CriuReq{ +- Type: &t, +- // Theoretically this should not be necessary but CRIU +- // segfaults if Opts is empty. +- // Fixed in CRIU 2.12 +- Opts: rpcOpts, +- Features: criuFeat, +- } +- +- err := c.criuSwrk(nil, req, criuOpts, false) +- if err != nil { +- logrus.Debugf("%s", err) +- return fmt.Errorf("CRIU feature check failed") +- } +- +- logrus.Debugf("Feature check says: %s", criuFeatures) +- missingFeatures := false +- +- if *criuFeat.MemTrack && !*criuFeatures.MemTrack { +- missingFeatures = true +- logrus.Debugf("CRIU does not support MemTrack") +- } +- +- if missingFeatures { +- return fmt.Errorf("CRIU is missing features") +- } +- +- return nil +-} +- +-// checkCriuVersion checks Criu version greater than or equal to minVersion +-func (c *linuxContainer) checkCriuVersion(minVersion string) error { +- var x, y, z, versionReq int +- +- _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 +- if err != nil { +- _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6 +- } +- versionReq = x*10000 + y*100 + z +- +- out, err := exec.Command(c.criuPath, "-V").Output() +- if err != nil { +- return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath) +- } +- +- x = 0 +- y = 0 +- z = 0 +- if ep := strings.Index(string(out), "-"); ep >= 0 { +- // criu Git version format +- var version string +- if sp := strings.Index(string(out), "GitID"); sp > 0 { +- version = string(out)[sp:ep] +- } else { +- return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath) +- } +- +- n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 +- if err != nil { +- n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6 +- y++ +- } else { +- z++ +- } +- if n < 2 || err != nil { +- return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err) +- } +- } else { +- // criu release version format +- n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2 +- if err != nil { +- n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6 +- } +- if n < 2 || err != nil { +- return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err) +- } +- } +- +- c.criuVersion = x*10000 + y*100 + z +- +- if c.criuVersion < versionReq { +- return fmt.Errorf("CRIU version must be %s or higher", minVersion) +- } +- +- return nil +-} +- +-const descriptorsFilename = "descriptors.json" +- +-func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { +- mountDest := m.Destination +- if strings.HasPrefix(mountDest, c.config.Rootfs) { +- mountDest = mountDest[len(c.config.Rootfs):] +- } +- +- extMnt := &criurpc.ExtMountMap{ +- Key: proto.String(mountDest), +- Val: proto.String(mountDest), +- } +- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +-} +- +-func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { +- for _, path := range c.config.MaskPaths { +- fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path)) +- if err != nil { +- if os.IsNotExist(err) { +- continue +- } +- return err +- } +- if fi.IsDir() { +- continue +- } +- +- extMnt := &criurpc.ExtMountMap{ +- Key: proto.String(path), +- Val: proto.String("/dev/null"), +- } +- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +- } +- +- return nil +-} +- +-func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { +- c.m.Lock() +- defer c.m.Unlock() +- +- // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has +- // support for doing unprivileged dumps, but the setup of +- // rootless containers might make this complicated. +- if c.config.Rootless { +- return fmt.Errorf("cannot checkpoint a rootless container") +- } +- +- if err := c.checkCriuVersion("1.5.2"); err != nil { +- return err +- } +- +- if criuOpts.ImagesDirectory == "" { +- return fmt.Errorf("invalid directory to save checkpoint") +- } +- +- // Since a container can be C/R'ed multiple times, +- // the checkpoint directory may already exist. +- if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) { +- return err +- } +- +- if criuOpts.WorkDirectory == "" { +- criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") +- } +- +- if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) { +- return err +- } +- +- workDir, err := os.Open(criuOpts.WorkDirectory) +- if err != nil { +- return err +- } +- defer workDir.Close() +- +- imageDir, err := os.Open(criuOpts.ImagesDirectory) +- if err != nil { +- return err +- } +- defer imageDir.Close() +- +- rpcOpts := criurpc.CriuOpts{ +- ImagesDirFd: proto.Int32(int32(imageDir.Fd())), +- WorkDirFd: proto.Int32(int32(workDir.Fd())), +- LogLevel: proto.Int32(4), +- LogFile: proto.String("dump.log"), +- Root: proto.String(c.config.Rootfs), +- ManageCgroups: proto.Bool(true), +- NotifyScripts: proto.Bool(true), +- Pid: proto.Int32(int32(c.initProcess.pid())), +- ShellJob: proto.Bool(criuOpts.ShellJob), +- LeaveRunning: proto.Bool(criuOpts.LeaveRunning), +- TcpEstablished: proto.Bool(criuOpts.TcpEstablished), +- ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), +- FileLocks: proto.Bool(criuOpts.FileLocks), +- EmptyNs: proto.Uint32(criuOpts.EmptyNs), +- } +- +- // append optional criu opts, e.g., page-server and port +- if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { +- rpcOpts.Ps = &criurpc.CriuPageServerInfo{ +- Address: proto.String(criuOpts.PageServer.Address), +- Port: proto.Int32(criuOpts.PageServer.Port), +- } +- } +- +- //pre-dump may need parentImage param to complete iterative migration +- if criuOpts.ParentImage != "" { +- rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) +- rpcOpts.TrackMem = proto.Bool(true) +- } +- +- // append optional manage cgroups mode +- if criuOpts.ManageCgroupsMode != 0 { +- if err := c.checkCriuVersion("1.7"); err != nil { +- return err +- } +- mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) +- rpcOpts.ManageCgroupsMode = &mode +- } +- +- var t criurpc.CriuReqType +- if criuOpts.PreDump { +- feat := criurpc.CriuFeatures{ +- MemTrack: proto.Bool(true), +- } +- +- if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { +- return err +- } +- +- t = criurpc.CriuReqType_PRE_DUMP +- } else { +- t = criurpc.CriuReqType_DUMP +- } +- req := &criurpc.CriuReq{ +- Type: &t, +- Opts: &rpcOpts, +- } +- +- //no need to dump these information in pre-dump +- if !criuOpts.PreDump { +- for _, m := range c.config.Mounts { +- switch m.Device { +- case "bind": +- c.addCriuDumpMount(req, m) +- break +- case "cgroup": +- binds, err := getCgroupMounts(m) +- if err != nil { +- return err +- } +- for _, b := range binds { +- c.addCriuDumpMount(req, b) +- } +- break +- } +- } +- +- if err := c.addMaskPaths(req); err != nil { +- return err +- } +- +- for _, node := range c.config.Devices { +- m := &configs.Mount{Destination: node.Path, Source: node.Path} +- c.addCriuDumpMount(req, m) +- } +- +- // Write the FD info to a file in the image directory +- fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) +- if err != nil { +- return err +- } +- +- err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655) +- if err != nil { +- return err +- } +- } +- +- err = c.criuSwrk(nil, req, criuOpts, false) +- if err != nil { +- return err +- } +- return nil +-} +- +-func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { +- mountDest := m.Destination +- if strings.HasPrefix(mountDest, c.config.Rootfs) { +- mountDest = mountDest[len(c.config.Rootfs):] +- } +- +- extMnt := &criurpc.ExtMountMap{ +- Key: proto.String(mountDest), +- Val: proto.String(m.Source), +- } +- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) +-} +- +-func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { +- for _, iface := range c.config.Networks { +- switch iface.Type { +- case "veth": +- veth := new(criurpc.CriuVethPair) +- veth.IfOut = proto.String(iface.HostInterfaceName) +- veth.IfIn = proto.String(iface.Name) +- req.Opts.Veths = append(req.Opts.Veths, veth) +- break +- case "loopback": +- break +- } +- } +- for _, i := range criuOpts.VethPairs { +- veth := new(criurpc.CriuVethPair) +- veth.IfOut = proto.String(i.HostInterfaceName) +- veth.IfIn = proto.String(i.ContainerInterfaceName) +- req.Opts.Veths = append(req.Opts.Veths, veth) +- } +-} +- +-func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { +- c.m.Lock() +- defer c.m.Unlock() +- +- // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have +- // support for unprivileged restore at the moment. +- if c.config.Rootless { +- return fmt.Errorf("cannot restore a rootless container") +- } +- +- if err := c.checkCriuVersion("1.5.2"); err != nil { +- return err +- } +- if criuOpts.WorkDirectory == "" { +- criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") +- } +- // Since a container can be C/R'ed multiple times, +- // the work directory may already exist. +- if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) { +- return err +- } +- workDir, err := os.Open(criuOpts.WorkDirectory) +- if err != nil { +- return err +- } +- defer workDir.Close() +- if criuOpts.ImagesDirectory == "" { +- return fmt.Errorf("invalid directory to restore checkpoint") +- } +- imageDir, err := os.Open(criuOpts.ImagesDirectory) +- if err != nil { +- return err +- } +- defer imageDir.Close() +- // CRIU has a few requirements for a root directory: +- // * it must be a mount point +- // * its parent must not be overmounted +- // c.config.Rootfs is bind-mounted to a temporary directory +- // to satisfy these requirements. +- root := filepath.Join(c.root, "criu-root") +- if err := os.Mkdir(root, 0755); err != nil { +- return err +- } +- defer os.Remove(root) +- root, err = filepath.EvalSymlinks(root) +- if err != nil { +- return err +- } +- err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "") +- if err != nil { +- return err +- } +- defer syscall.Unmount(root, syscall.MNT_DETACH) +- t := criurpc.CriuReqType_RESTORE +- req := &criurpc.CriuReq{ +- Type: &t, +- Opts: &criurpc.CriuOpts{ +- ImagesDirFd: proto.Int32(int32(imageDir.Fd())), +- WorkDirFd: proto.Int32(int32(workDir.Fd())), +- EvasiveDevices: proto.Bool(true), +- LogLevel: proto.Int32(4), +- LogFile: proto.String("restore.log"), +- RstSibling: proto.Bool(true), +- Root: proto.String(root), +- ManageCgroups: proto.Bool(true), +- NotifyScripts: proto.Bool(true), +- ShellJob: proto.Bool(criuOpts.ShellJob), +- ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), +- TcpEstablished: proto.Bool(criuOpts.TcpEstablished), +- FileLocks: proto.Bool(criuOpts.FileLocks), +- EmptyNs: proto.Uint32(criuOpts.EmptyNs), +- }, +- } +- +- for _, m := range c.config.Mounts { +- switch m.Device { +- case "bind": +- c.addCriuRestoreMount(req, m) +- break +- case "cgroup": +- binds, err := getCgroupMounts(m) +- if err != nil { +- return err +- } +- for _, b := range binds { +- c.addCriuRestoreMount(req, b) +- } +- break +- } +- } +- +- if len(c.config.MaskPaths) > 0 { +- m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} +- c.addCriuRestoreMount(req, m) +- } +- +- for _, node := range c.config.Devices { +- m := &configs.Mount{Destination: node.Path, Source: node.Path} +- c.addCriuRestoreMount(req, m) +- } +- +- if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 { +- c.restoreNetwork(req, criuOpts) +- } +- +- // append optional manage cgroups mode +- if criuOpts.ManageCgroupsMode != 0 { +- if err := c.checkCriuVersion("1.7"); err != nil { +- return err +- } +- mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) +- req.Opts.ManageCgroupsMode = &mode +- } +- +- var ( +- fds []string +- fdJSON []byte +- ) +- if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { +- return err +- } +- +- if err := json.Unmarshal(fdJSON, &fds); err != nil { +- return err +- } +- for i := range fds { +- if s := fds[i]; strings.Contains(s, "pipe:") { +- inheritFd := new(criurpc.InheritFd) +- inheritFd.Key = proto.String(s) +- inheritFd.Fd = proto.Int32(int32(i)) +- req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) +- } +- } +- return c.criuSwrk(process, req, criuOpts, true) +-} +- +-func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { +- // XXX: Do we need to deal with this case? AFAIK criu still requires root. +- if err := c.cgroupManager.Apply(pid); err != nil { +- return err +- } +- +- if err := c.cgroupManager.Set(c.config); err != nil { +- return newSystemError(err) +- } +- +- path := fmt.Sprintf("/proc/%d/cgroup", pid) +- cgroupsPaths, err := cgroups.ParseCgroupFile(path) +- if err != nil { +- return err +- } +- +- for c, p := range cgroupsPaths { +- cgroupRoot := &criurpc.CgroupRoot{ +- Ctrl: proto.String(c), +- Path: proto.String(p), +- } +- req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) +- } +- +- return nil +-} +- +-func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { +- fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0) +- if err != nil { +- return err +- } +- +- logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) +- criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") +- criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") +- defer criuClient.Close() +- defer criuServer.Close() +- +- args := []string{"swrk", "3"} +- logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) +- logrus.Debugf("Using CRIU with following args: %s", args) +- cmd := exec.Command(c.criuPath, args...) +- if process != nil { +- cmd.Stdin = process.Stdin +- cmd.Stdout = process.Stdout +- cmd.Stderr = process.Stderr +- } +- cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) +- +- if err := cmd.Start(); err != nil { +- return err +- } +- criuServer.Close() +- +- defer func() { +- criuClient.Close() +- _, err := cmd.Process.Wait() +- if err != nil { +- return +- } +- }() +- +- if applyCgroups { +- err := c.criuApplyCgroups(cmd.Process.Pid, req) +- if err != nil { +- return err +- } +- } +- +- var extFds []string +- if process != nil { +- extFds, err = getPipeFds(cmd.Process.Pid) +- if err != nil { +- return err +- } +- } +- +- logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) +- // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts() +- // should be empty. For older CRIU versions it still will be +- // available but empty. +- if req.GetType() != criurpc.CriuReqType_FEATURE_CHECK { +- val := reflect.ValueOf(req.GetOpts()) +- v := reflect.Indirect(val) +- for i := 0; i < v.NumField(); i++ { +- st := v.Type() +- name := st.Field(i).Name +- if strings.HasPrefix(name, "XXX_") { +- continue +- } +- value := val.MethodByName("Get" + name).Call([]reflect.Value{}) +- logrus.Debugf("CRIU option %s with value %v", name, value[0]) +- } +- } +- data, err := proto.Marshal(req) +- if err != nil { +- return err +- } +- _, err = criuClient.Write(data) +- if err != nil { +- return err +- } +- +- buf := make([]byte, 10*4096) +- for true { +- n, err := criuClient.Read(buf) +- if err != nil { +- return err +- } +- if n == 0 { +- return fmt.Errorf("unexpected EOF") +- } +- if n == len(buf) { +- return fmt.Errorf("buffer is too small") +- } +- +- resp := new(criurpc.CriuResp) +- err = proto.Unmarshal(buf[:n], resp) +- if err != nil { +- return err +- } +- if !resp.GetSuccess() { +- typeString := req.GetType().String() +- return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) +- } +- +- t := resp.GetType() +- switch { +- case t == criurpc.CriuReqType_FEATURE_CHECK: +- logrus.Debugf("Feature check says: %s", resp) +- criuFeatures = resp.GetFeatures() +- break +- case t == criurpc.CriuReqType_NOTIFY: +- if err := c.criuNotifications(resp, process, opts, extFds); err != nil { +- return err +- } +- t = criurpc.CriuReqType_NOTIFY +- req = &criurpc.CriuReq{ +- Type: &t, +- NotifySuccess: proto.Bool(true), +- } +- data, err = proto.Marshal(req) +- if err != nil { +- return err +- } +- _, err = criuClient.Write(data) +- if err != nil { +- return err +- } +- continue +- case t == criurpc.CriuReqType_RESTORE: +- case t == criurpc.CriuReqType_DUMP: +- break +- case t == criurpc.CriuReqType_PRE_DUMP: +- // In pre-dump mode CRIU is in a loop and waits for +- // the final DUMP command. +- // The current runc pre-dump approach, however, is +- // start criu in PRE_DUMP once for a single pre-dump +- // and not the whole series of pre-dump, pre-dump, ...m, dump +- // If we got the message CriuReqType_PRE_DUMP it means +- // CRIU was successful and we need to forcefully stop CRIU +- logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service") +- criuClient.Close() +- // Process status won't be success, because one end of sockets is closed +- _, err := cmd.Process.Wait() +- if err != nil { +- logrus.Debugf("After PRE_DUMP CRIU exiting failed") +- return err +- } +- return nil +- default: +- return fmt.Errorf("unable to parse the response %s", resp.String()) +- } +- +- break +- } +- +- // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. +- // Here we want to wait only the CRIU process. +- st, err := cmd.Process.Wait() +- if err != nil { +- return err +- } +- if !st.Success() { +- return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath) +- } +- return nil +-} +- +-// block any external network activity +-func lockNetwork(config *configs.Config) error { +- for _, config := range config.Networks { +- strategy, err := getStrategy(config.Type) +- if err != nil { +- return err +- } +- +- if err := strategy.detach(config); err != nil { +- return err +- } +- } +- return nil +-} +- +-func unlockNetwork(config *configs.Config) error { +- for _, config := range config.Networks { +- strategy, err := getStrategy(config.Type) +- if err != nil { +- return err +- } +- if err = strategy.attach(config); err != nil { +- return err +- } +- } +- return nil +-} +- +-func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error { +- notify := resp.GetNotify() +- if notify == nil { +- return fmt.Errorf("invalid response: %s", resp.String()) +- } +- switch { +- case notify.GetScript() == "post-dump": +- f, err := os.Create(filepath.Join(c.root, "checkpoint")) +- if err != nil { +- return err +- } +- f.Close() +- case notify.GetScript() == "network-unlock": +- if err := unlockNetwork(c.config); err != nil { +- return err +- } +- case notify.GetScript() == "network-lock": +- if err := lockNetwork(c.config); err != nil { +- return err +- } +- case notify.GetScript() == "setup-namespaces": +- if c.config.Hooks != nil { +- s := configs.HookState{ +- SpecState: configs.SpecState{ +- Version: c.config.Version, +- ID: c.id, +- Pid: int(notify.GetPid()), +- Bundle: utils.SearchLabels(c.config.Labels, "bundle"), +- }, +- Root: c.config.Rootfs, +- } +- for i, hook := range c.config.Hooks.Prestart { +- logrus.Infof("run prestart hook: %d:%s, ContainerID: %s", i, hook.Info(), s.ID) +- if err := hook.Run(s); err != nil { +- return newSystemErrorWithCausef(err, "running prestart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID) +- } +- logrus.Infof("prestart hook: %d:%s done", i, hook.Info()) +- } +- } +- case notify.GetScript() == "post-restore": +- pid := notify.GetPid() +- r, err := newRestoredProcess(int(pid), fds) +- if err != nil { +- return err +- } +- process.ops = r +- if err := c.state.transition(&restoredState{ +- imageDir: opts.ImagesDirectory, +- c: c, +- }); err != nil { +- return err +- } +- // create a timestamp indicating when the restored checkpoint was started +- c.created = time.Now().UTC() +- if _, err := c.updateState(r); err != nil { +- return err +- } +- if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { +- if !os.IsNotExist(err) { +- logrus.Error(err) +- } +- } +- } +- return nil +-} +- +-func (c *linuxContainer) updateState(process parentProcess) (*State, error) { +- c.initProcess = process +- state, err := c.currentState() +- if err != nil { +- return nil, err +- } +- err = c.saveState(state) +- if err != nil { +- return nil, err +- } +- return state, nil +-} +- +-func (c *linuxContainer) saveState(s *State) error { +- f, err := os.Create(filepath.Join(c.root, stateFilename)) +- if err != nil { +- return err +- } +- defer f.Close() +- return utils.WriteJSON(f, s) +-} +- +-func (c *linuxContainer) deleteState() error { +- return os.Remove(filepath.Join(c.root, stateFilename)) +-} +- +-func (c *linuxContainer) currentStatus() (Status, error) { +- if err := c.refreshState(); err != nil { +- return -1, err +- } +- return c.state.status(), nil +-} +- +-// refreshState needs to be called to verify that the current state on the +-// container is what is true. Because consumers of libcontainer can use it +-// out of process we need to verify the container's status based on runtime +-// information and not rely on our in process info. +-func (c *linuxContainer) refreshState() error { +- paused, err := c.isPaused() +- if err != nil { +- return err +- } +- if paused { +- return c.state.transition(&pausedState{c: c}) +- } +- t, err := c.runType() +- if err != nil { +- return err +- } +- switch t { +- case Created: +- return c.state.transition(&createdState{c: c}) +- case Running: +- return c.state.transition(&runningState{c: c}) +- } +- return c.state.transition(&stoppedState{c: c}) +-} +- +-// doesInitProcessExist checks if the init process is still the same process +-// as the initial one, it could happen that the original process has exited +-// and a new process has been created with the same pid, in this case, the +-// container would already be stopped. +-func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) { +- startTime, err := system.GetProcessStartTime(initPid) +- if err != nil { +- return false, nil +- } +- if c.initProcessStartTime != startTime { +- return false, nil +- } +- return true, nil +-} +- +-func (c *linuxContainer) runType() (Status, error) { +- if c.initProcess == nil { +- return Stopped, nil +- } +- pid := c.initProcess.pid() +- // return Running if the init process is alive +- if err := syscall.Kill(pid, 0); err != nil { +- if err == syscall.ESRCH { +- // It means the process does not exist anymore, could happen when the +- // process exited just when we call the function, we should not return +- // error in this case. +- return Stopped, nil +- } +- return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid) +- } +- // check if the process is still the original init process. +- exist, err := c.doesInitProcessExist(pid) +- if !exist || err != nil { +- return Stopped, err +- } +- // We'll create exec fifo and blocking on it after container is created, +- // and delete it after start container. +- if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil { +- return Created, nil +- } +- return Running, nil +-} +- +-func (c *linuxContainer) isPaused() (bool, error) { +- fcg := c.cgroupManager.GetPaths()["freezer"] +- if fcg == "" { +- // A container doesn't have a freezer cgroup +- return false, nil +- } +- data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state")) +- if err != nil { +- // If freezer cgroup is not mounted, the container would just be not paused. +- if os.IsNotExist(err) { +- return false, nil +- } +- return false, newSystemErrorWithCause(err, "checking if container is paused") +- } +- return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil +-} +- +-func (c *linuxContainer) currentState() (*State, error) { +- var ( +- startTime string +- externalDescriptors []string +- pid = -1 +- ) +- if c.initProcess != nil { +- pid = c.initProcess.pid() +- startTime, _ = c.initProcess.startTime() +- externalDescriptors = c.initProcess.externalDescriptors() +- } +- state := &State{ +- BaseState: BaseState{ +- ID: c.ID(), +- Config: *c.config, +- InitProcessPid: pid, +- InitProcessStartTime: startTime, +- Created: c.created, +- }, +- Rootless: c.config.Rootless, +- CgroupPaths: c.cgroupManager.GetPaths(), +- NamespacePaths: make(map[configs.NamespaceType]string), +- ExternalDescriptors: externalDescriptors, +- } +- if pid > 0 { +- for _, ns := range c.config.Namespaces { +- state.NamespacePaths[ns.Type] = ns.GetPath(pid) +- } +- for _, nsType := range configs.NamespaceTypes() { +- if !configs.IsNamespaceSupported(nsType) { +- continue +- } +- if _, ok := state.NamespacePaths[nsType]; !ok { +- ns := configs.Namespace{Type: nsType} +- state.NamespacePaths[ns.Type] = ns.GetPath(pid) +- } +- } +- } +- return state, nil +-} +- +-// orderNamespacePaths sorts namespace paths into a list of paths that we +-// can setns in order. +-func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { +- paths := []string{} +- +- for _, ns := range configs.NamespaceTypes() { +- +- // Remove namespaces that we don't need to join. +- if !c.config.Namespaces.Contains(ns) { +- continue +- } +- +- if p, ok := namespaces[ns]; ok && p != "" { +- // check if the requested namespace is supported +- if !configs.IsNamespaceSupported(ns) { +- return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns)) +- } +- // only set to join this namespace if it exists +- if _, err := os.Lstat(p); err != nil { +- return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) +- } +- // do not allow namespace path with comma as we use it to separate +- // the namespace paths +- if strings.ContainsRune(p, ',') { +- return nil, newSystemError(fmt.Errorf("invalid path %s", p)) +- } +- paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p)) +- } +- +- } +- +- return paths, nil +-} +- +-func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { +- data := bytes.NewBuffer(nil) +- for _, im := range idMap { +- line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) +- if _, err := data.WriteString(line); err != nil { +- return nil, err +- } +- } +- return data.Bytes(), nil +-} +- +-// bootstrapData encodes the necessary data in netlink binary format +-// as a io.Reader. +-// Consumer can write the data to a bootstrap program +-// such as one that uses nsenter package to bootstrap the container's +-// init process correctly, i.e. with correct namespaces, uid/gid +-// mapping etc. +-func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) { +- // create the netlink message +- r := nl.NewNetlinkRequest(int(InitMsg), 0) +- +- // write cloneFlags +- r.AddData(&Int32msg{ +- Type: CloneFlagsAttr, +- Value: uint32(cloneFlags), +- }) +- +- // write custom namespace paths +- if len(nsMaps) > 0 { +- nsPaths, err := c.orderNamespacePaths(nsMaps) +- if err != nil { +- return nil, err +- } +- r.AddData(&Bytemsg{ +- Type: NsPathsAttr, +- Value: []byte(strings.Join(nsPaths, ",")), +- }) +- } +- +- // write namespace paths only when we are not joining an existing user ns +- _, joinExistingUser := nsMaps[configs.NEWUSER] +- if !joinExistingUser { +- // write uid mappings +- if len(c.config.UidMappings) > 0 { +- b, err := encodeIDMapping(c.config.UidMappings) +- if err != nil { +- return nil, err +- } +- r.AddData(&Bytemsg{ +- Type: UidmapAttr, +- Value: b, +- }) +- } +- +- // write gid mappings +- if len(c.config.GidMappings) > 0 { +- b, err := encodeIDMapping(c.config.GidMappings) +- if err != nil { +- return nil, err +- } +- r.AddData(&Bytemsg{ +- Type: GidmapAttr, +- Value: b, +- }) +- // The following only applies if we are root. +- if !c.config.Rootless { +- // check if we have CAP_SETGID to setgroup properly +- pid, err := capability.NewPid(os.Getpid()) +- if err != nil { +- return nil, err +- } +- if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { +- r.AddData(&Boolmsg{ +- Type: SetgroupAttr, +- Value: true, +- }) +- } +- } +- } +- } +- +- // write oom_score_adj +- r.AddData(&Bytemsg{ +- Type: OomScoreAdjAttr, +- Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), +- }) +- +- // write rootless +- r.AddData(&Boolmsg{ +- Type: RootlessAttr, +- Value: c.config.Rootless, +- }) +- +- return bytes.NewReader(r.Serialize()), nil +-} +diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go +index 0b2aa74..15ba017 100644 +--- a/libcontainer/factory_linux.go ++++ b/libcontainer/factory_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -245,10 +246,10 @@ func (l *LinuxFactory) Type() string { + // This is a low level implementation detail of the reexec and should not be consumed externally + func (l *LinuxFactory) StartInitialization() (err error) { + var ( +- pipefd, rootfd int ++ pipefd, fifofd int + consoleSocket *os.File + envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") +- envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR") ++ envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD") + envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") + ) + +@@ -264,11 +265,11 @@ func (l *LinuxFactory) StartInitialization() (err error) { + ) + defer pipe.Close() + +- // Only init processes have STATEDIR. +- rootfd = -1 ++ // Only init processes have FIFOFD. ++ fifofd = -1 + if it == initStandard { +- if rootfd, err = strconv.Atoi(envStateDir); err != nil { +- return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err) ++ if fifofd, err = strconv.Atoi(envFifoFd); err != nil { ++ return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) + } + } + +@@ -309,7 +310,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { + } + }() + +- i, err := newContainerInit(it, pipe, consoleSocket, rootfd, logPipeFd) ++ i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd) + if err != nil { + return err + } +diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go +index e9a83e9..fd417ca 100644 +--- a/libcontainer/init_linux.go ++++ b/libcontainer/init_linux.go +@@ -1,18 +1,23 @@ ++//go:build linux + // +build linux + + package libcontainer + + import ( + "encoding/json" ++ "errors" + "fmt" + "io" + "net" + "os" ++ "path/filepath" + "strings" + "syscall" + "unsafe" + + "github.com/Sirupsen/logrus" ++ "golang.org/x/sys/unix" ++ + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" +@@ -66,7 +71,7 @@ type initer interface { + Init() error + } + +-func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD, logFd int) (initer, error) { ++func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) { + var config *initConfig + if err := json.NewDecoder(pipe).Decode(&config); err != nil { + return nil, err +@@ -89,7 +94,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDi + consoleSocket: consoleSocket, + parentPid: syscall.Getppid(), + config: config, +- stateDirFD: stateDirFD, ++ fifoFd: fifoFd, + logFd: logFd, + }, nil + } +@@ -111,6 +116,32 @@ func populateProcessEnvironment(env []string) error { + return nil + } + ++// verifyCwd ensures that the current directory is actually inside the mount ++// namespace root of the current process. ++func verifyCwd() error { ++ // getcwd(2) on Linux detects if cwd is outside of the rootfs of the ++ // current mount namespace root, and in that case prefixes "(unreachable)" ++ // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect ++ // when this happens and return ENOENT rather than returning a non-absolute ++ // path. In both cases we can therefore easily detect if we have an invalid ++ // cwd by checking the return value of getcwd(3). See getcwd(3) for more ++ // details, and CVE-2024-21626 for the security issue that motivated this ++ // check. ++ // ++ // We have to use unix.Getwd() here because os.Getwd() has a workaround for ++ // $PWD which involves doing stat(.), which can fail if the current ++ // directory is inaccessible to the container process. ++ if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { ++ return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") ++ } else if err != nil { ++ return fmt.Errorf("failed to verify if current working directory is safe: %w", err) ++ } else if !filepath.IsAbs(wd) { ++ // We shouldn't ever hit this, but check just in case. ++ return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) ++ } ++ return nil ++} ++ + // finalizeNamespace drops the caps, sets the correct user + // and working dir, and closes any leaked file descriptors + // before executing the command inside the namespace +@@ -148,6 +179,10 @@ func finalizeNamespace(config *initConfig) error { + if err := setupUser(config); err != nil { + return err + } ++ // Make sure our final working directory is inside the container. ++ if err := verifyCwd(); err != nil { ++ return err ++ } + if err := system.ClearKeepCaps(); err != nil { + return err + } +diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go +index 5cdc30c..e786419 100644 +--- a/libcontainer/process_linux.go ++++ b/libcontainer/process_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -204,7 +205,6 @@ type initProcess struct { + process *Process + bootstrapData io.Reader + sharePidns bool +- rootDir *os.File + } + + func (p *initProcess) pid() int { +@@ -257,7 +257,6 @@ func (p *initProcess) start() error { + err := p.cmd.Start() + p.process.ops = p + p.childPipe.Close() +- p.rootDir.Close() + logs.CloseChild() + if err != nil { + p.process.ops = nil +diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go +index 1f7ec98..e38165d 100644 +--- a/libcontainer/setns_init_linux.go ++++ b/libcontainer/setns_init_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -73,5 +74,23 @@ func (l *linuxSetnsInit) Init() error { + syscall.Close(l.logFd) + } + ++ // Close all file descriptors we are not passing to the container. This is ++ // necessary because the execve target could use internal runc fds as the ++ // execve path, potentially giving access to binary files from the host ++ // (which can then be opened by container processes, leading to container ++ // escapes). Note that because this operation will close any open file ++ // descriptors that are referenced by (*os.File) handles from underneath ++ // the Go runtime, we must not do any file operations after this point ++ // (otherwise the (*os.File) finaliser could close the wrong file). See ++ // CVE-2024-21626 for more information as to why this protection is ++ // necessary. ++ // ++ // This is not needed for runc-dmz, because the extra execve(2) step means ++ // that all O_CLOEXEC file descriptors have already been closed and thus ++ // the second execve(2) from runc-dmz cannot access internal file ++ // descriptors from runc. ++ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { ++ return err ++ } + return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + } +diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go +index 6236593..7ebf1a2 100644 +--- a/libcontainer/standard_init_linux.go ++++ b/libcontainer/standard_init_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -15,14 +16,17 @@ import ( + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" ++ "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/selinux/go-selinux/label" ++ ++ "golang.org/x/sys/unix" + ) + + type linuxStandardInit struct { + pipe *os.File + consoleSocket *os.File + parentPid int +- stateDirFD int ++ fifoFd int + config *initConfig + logFd int + } +@@ -187,7 +191,7 @@ func (l *linuxStandardInit) Init() error { + // exec'ing the users process. + ch := make(chan Error, 1) + go func() { +- fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) ++ fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) + if err != nil { + ch <- newSystemErrorWithCause(err, "openat exec fifo") + return +@@ -215,7 +219,25 @@ func (l *linuxStandardInit) Init() error { + } + // close the statedir fd before exec because the kernel resets dumpable in the wrong order + // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 +- syscall.Close(l.stateDirFD) ++ unix.Close(l.fifoFd) ++ // Close all file descriptors we are not passing to the container. This is ++ // necessary because the execve target could use internal runc fds as the ++ // execve path, potentially giving access to binary files from the host ++ // (which can then be opened by container processes, leading to container ++ // escapes). Note that because this operation will close any open file ++ // descriptors that are referenced by (*os.File) handles from underneath ++ // the Go runtime, we must not do any file operations after this point ++ // (otherwise the (*os.File) finaliser could close the wrong file). See ++ // CVE-2024-21626 for more information as to why this protection is ++ // necessary. ++ // ++ // This is not needed for runc-dmz, because the extra execve(2) step means ++ // that all O_CLOEXEC file descriptors have already been closed and thus ++ // the second execve(2) from runc-dmz cannot access internal file ++ // descriptors from runc. ++ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { ++ return err ++ } + if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCause(err, "exec user process") + } +diff --git a/libcontainer/standard_init_linux.go.orig b/libcontainer/standard_init_linux.go.orig +deleted file mode 100644 +index 611b91d..0000000 +--- a/libcontainer/standard_init_linux.go.orig ++++ /dev/null +@@ -1,223 +0,0 @@ +-// +build linux +- +-package libcontainer +- +-import ( +- "fmt" +- "os" +- "os/exec" +- "strings" +- "syscall" +- "time" +- +- "github.com/opencontainers/runc/libcontainer/apparmor" +- "github.com/opencontainers/runc/libcontainer/configs" +- "github.com/opencontainers/runc/libcontainer/keys" +- "github.com/opencontainers/runc/libcontainer/seccomp" +- "github.com/opencontainers/runc/libcontainer/system" +- "github.com/opencontainers/selinux/go-selinux/label" +-) +- +-type linuxStandardInit struct { +- pipe *os.File +- consoleSocket *os.File +- parentPid int +- stateDirFD int +- config *initConfig +- logFd int +-} +- +-func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { +- var newperms uint32 +- +- if l.config.Config.Namespaces.Contains(configs.NEWUSER) { +- // with user ns we need 'other' search permissions +- newperms = 0x8 +- } else { +- // without user ns we need 'UID' search permissions +- newperms = 0x80000 +- } +- +- // create a unique per session container name that we can +- // join in setns; however, other containers can also join it +- return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms +-} +- +-// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value +-// the kernel +-const PR_SET_NO_NEW_PRIVS = 0x26 +- +-func (l *linuxStandardInit) Init() error { +- if !l.config.Config.NoNewKeyring { +- ringname, keepperms, newperms := l.getSessionRingParams() +- +- // do not inherit the parent's session keyring +- sessKeyId, err := keys.JoinSessionKeyring(ringname) +- if err != nil { +- return err +- } +- // make session keyring searcheable +- if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { +- return err +- } +- } +- +- if err := setupNetwork(l.config); err != nil { +- return err +- } +- if err := setupRoute(l.config.Config); err != nil { +- return err +- } +- +- label.Init() +- +- // prepareRootfs() can be executed only for a new mount namespace. +- if l.config.Config.Namespaces.Contains(configs.NEWNS) { +- if err := prepareRootfs(l.pipe, l.config.Config); err != nil { +- return err +- } +- } +- +- // Set up the console. This has to be done *before* we finalize the rootfs, +- // but *after* we've given the user the chance to set up all of the mounts +- // they wanted. +- if l.config.CreateConsole { +- if err := setupConsole(l.consoleSocket, l.config, true); err != nil { +- return err +- } +- if err := system.Setctty(); err != nil { +- return err +- } +- } +- +- // Finish the rootfs setup. +- if l.config.Config.Namespaces.Contains(configs.NEWNS) { +- if err := finalizeRootfs(l.config.Config); err != nil { +- return err +- } +- } +- +- if hostname := l.config.Config.Hostname; hostname != "" { +- if err := syscall.Sethostname([]byte(hostname)); err != nil { +- return err +- } +- } +- if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { +- return err +- } +- if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { +- return err +- } +- // when userns enabled, write to sysctl will fail, let docker-hooks do this job +- if len(l.config.Config.UidMappings) == 0 && len(l.config.Config.GidMappings) == 0 { +- for key, value := range l.config.Config.Sysctl { +- if err := writeSystemProperty(key, value); err != nil { +- return err +- } +- } +- } +- for _, path := range l.config.Config.ReadonlyPaths { +- if err := readonlyPath(path); err != nil { +- return err +- } +- } +- for _, m := range l.config.Config.Mounts { +- if m.Flags&syscall.MS_RDONLY == 0 && m.Device == "proc" && strings.HasPrefix(m.Destination, "/proc/sys/") { +- if err := remountReadWrite(m.Destination); err != nil { +- return err +- } +- } +- } +- for _, path := range l.config.Config.MaskPaths { +- if err := maskPath(path); err != nil { +- return err +- } +- } +- pdeath, err := system.GetParentDeathSignal() +- if err != nil { +- return err +- } +- if l.config.NoNewPrivileges { +- if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { +- return err +- } +- } +- // Tell our parent that we're ready to Execv. This must be done before the +- // Seccomp rules have been applied, because we need to be able to read and +- // write to a socket. +- if err := syncParentReady(l.pipe); err != nil { +- return err +- } +- // Without NoNewPrivileges seccomp is a privileged operation, so we need to +- // do this before dropping capabilities; otherwise do it as late as possible +- // just before execve so as few syscalls take place after it as possible. +- if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { +- if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { +- return err +- } +- } +- if err := finalizeNamespace(l.config); err != nil { +- return err +- } +- // finalizeNamespace can change user/group which clears the parent death +- // signal, so we restore it here. +- if err := pdeath.Restore(); err != nil { +- return err +- } +- // compare the parent from the initial start of the init process and make sure that it did not change. +- // if the parent changes that means it died and we were reparented to something else so we should +- // just kill ourself and not cause problems for someone else. +- if syscall.Getppid() != l.parentPid { +- return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) +- } +- // check for the arg before waiting to make sure it exists and it is returned +- // as a create time error. +- name, err := exec.LookPath(l.config.Args[0]) +- if err != nil { +- return err +- } +- // close the pipe to signal that we have completed our init. +- l.pipe.Close() +- +- if l.logFd != 0 { +- syscall.Close(l.logFd) +- } +- +- // wait for the fifo to be opened on the other side before +- // exec'ing the users process. +- ch := make(chan Error, 1) +- go func() { +- fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) +- if err != nil { +- ch <- newSystemErrorWithCause(err, "openat exec fifo") +- return +- } +- if _, err := syscall.Write(fd, []byte("0")); err != nil { +- ch <- newSystemErrorWithCause(err, "write 0 exec fifo") +- return +- } +- ch <- nil +- }() +- +- select { +- case chErr := <-ch: +- if chErr != nil { +- return chErr +- } +- case <-time.After(120 * time.Second): +- return newSystemErrorWithCause(fmt.Errorf("timeout"), "wait for the fifo to be opened on the other side ") +- } +- +- if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { +- if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { +- return newSystemErrorWithCause(err, "init seccomp") +- } +- } +- // close the statedir fd before exec because the kernel resets dumpable in the wrong order +- // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 +- syscall.Close(l.stateDirFD) +- if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { +- return newSystemErrorWithCause(err, "exec user process") +- } +- return nil +-} +diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go +index cd04ace..922cffb 100644 +--- a/libcontainer/utils/utils.go ++++ b/libcontainer/utils/utils.go +@@ -5,17 +5,12 @@ import ( + "encoding/binary" + "encoding/hex" + "encoding/json" +- "fmt" + "io" + "os" + "path/filepath" +- "strconv" + "strings" + "syscall" + "unsafe" +- +- securejoin "github.com/cyphar/filepath-securejoin" +- "golang.org/x/sys/unix" + ) + + const ( +@@ -175,36 +170,3 @@ func stripRoot(root, path string) string { + } + return CleanPath("/" + path) + } +- +-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +-// corresponding to the unsafePath resolved within the root. Before passing the +-// fd, this path is verified to have been inside the root -- so operating on it +-// through the passed fdpath should be safe. Do not access this path through +-// the original path strings, and do not attempt to use the pathname outside of +-// the passed closure (the file handle will be freed once the closure returns). +-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { +- // Remove the root then forcefully resolve inside the root. +- unsafePath = stripRoot(root, unsafePath) +- path, err := securejoin.SecureJoin(root, unsafePath) +- if err != nil { +- return fmt.Errorf("resolving path inside rootfs failed: %v", err) +- } +- +- // Open the target path. +- fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) +- if err != nil { +- return fmt.Errorf("open o_path procfd: %w", err) +- } +- defer fh.Close() +- +- // Double-check the path is the one we expected. +- procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) +- if realpath, err := os.Readlink(procfd); err != nil { +- return fmt.Errorf("procfd verification failed: %w", err) +- } else if realpath != path { +- return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) +- } +- +- // Run the closure. +- return fn(procfd) +-} +diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go +index 7b798cc..cfacfc2 100644 +--- a/libcontainer/utils/utils_unix.go ++++ b/libcontainer/utils/utils_unix.go +@@ -1,43 +1,264 @@ ++///go:build !windows ++//go:build !windows + // +build !windows + + package utils + + import ( +- "io/ioutil" ++ "fmt" ++ "math" + "os" ++ "path/filepath" ++ "runtime" + "strconv" +- "syscall" ++ "sync" ++ _ "unsafe" // for go:linkname ++ ++ securejoin "github.com/cyphar/filepath-securejoin" ++ "github.com/Sirupsen/logrus" ++ "golang.org/x/sys/unix" + ) + +-func CloseExecFrom(minFd int) error { +- fdList, err := ioutil.ReadDir("/proc/self/fd") ++// EnsureProcHandle returns whether or not the given file handle is on procfs. ++func EnsureProcHandle(fh *os.File) error { ++ var buf unix.Statfs_t ++ if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { ++ return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) ++ } ++ if buf.Type != unix.PROC_SUPER_MAGIC { ++ return fmt.Errorf("%s is not on procfs", fh.Name()) ++ } ++ return nil ++} ++ ++var ( ++ haveCloseRangeCloexecBool bool ++ haveCloseRangeCloexecOnce sync.Once ++) ++ ++func haveCloseRangeCloexec() bool { ++ haveCloseRangeCloexecOnce.Do(func() { ++ // Make sure we're not closing a random file descriptor. ++ tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0) ++ if err != nil { ++ return ++ } ++ defer unix.Close(tmpFd) ++ ++ err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC) ++ // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC). ++ // -ENOSYS and -EINVAL ultimately mean we don't have support, but any ++ // other potential error would imply that even the most basic close ++ // operation wouldn't work. ++ haveCloseRangeCloexecBool = err == nil ++ }) ++ return haveCloseRangeCloexecBool ++} ++ ++type fdFunc func(fd int) ++ ++// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in ++// the current process. ++func fdRangeFrom(minFd int, fn fdFunc) error { ++ procSelfFd, closer := ProcThreadSelf("fd") ++ defer closer() ++ ++ fdDir, err := os.Open(procSelfFd) ++ if err != nil { ++ return err ++ } ++ defer fdDir.Close() ++ ++ if err := EnsureProcHandle(fdDir); err != nil { ++ return err ++ } ++ ++ fdList, err := fdDir.Readdirnames(-1) + if err != nil { + return err + } +- for _, fi := range fdList { +- fd, err := strconv.Atoi(fi.Name()) ++ for _, fdStr := range fdList { ++ fd, err := strconv.Atoi(fdStr) ++ // Ignore non-numeric file names. + if err != nil { +- // ignore non-numeric file names + continue + } +- ++ // Ignore descriptors lower than our specified minimum. + if fd < minFd { +- // ignore descriptors lower than our specified minimum + continue + } +- +- // intentionally ignore errors from syscall.CloseOnExec +- syscall.CloseOnExec(fd) +- // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) ++ // Ignore the file descriptor we used for readdir, as it will be closed ++ // when we return. ++ if uintptr(fd) == fdDir.Fd() { ++ continue ++ } ++ // Run the closure. ++ fn(fd) + } + return nil + } + +-// NewSockPair returns a new unix socket pair +-func NewSockPair(name string) (parent *os.File, child *os.File, err error) { +- fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) ++// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or ++// equal to minFd in the current process. ++func CloseExecFrom(minFd int) error { ++ // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. ++ if haveCloseRangeCloexec() { ++ err := unix.CloseRange(uint(minFd), math.MaxUint64, unix.CLOSE_RANGE_CLOEXEC) ++ return os.NewSyscallError("close_range", err) ++ } ++ // Otherwise, fall back to the standard loop. ++ return fdRangeFrom(minFd, unix.CloseOnExec) ++} ++ ++//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor ++ ++// In order to make sure we do not close the internal epoll descriptors the Go ++// runtime uses, we need to ensure that we skip descriptors that match ++// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, ++// unfortunately there's no other way to be sure we're only keeping the file ++// descriptors the Go runtime needs. Hopefully nothing blows up doing this... ++func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive ++ ++// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the ++// current process, except for those critical to Go's runtime (such as the ++// netpoll management descriptors). ++// ++// NOTE: That this function is incredibly dangerous to use in most Go code, as ++// closing file descriptors from underneath *os.File handles can lead to very ++// bad behaviour (the closed file descriptor can be re-used and then any ++// *os.File operations would apply to the wrong file). This function is only ++// intended to be called from the last stage of runc init. ++func UnsafeCloseFrom(minFd int) error { ++ // We cannot use close_range(2) even if it is available, because we must ++ // not close some file descriptors. ++ return fdRangeFrom(minFd, func(fd int) { ++ if runtime_IsPollDescriptor(uintptr(fd)) { ++ // These are the Go runtimes internal netpoll file descriptors. ++ // These file descriptors are operated on deep in the Go scheduler, ++ // and closing those files from underneath Go can result in panics. ++ // There is no issue with keeping them because they are not ++ // executable and are not useful to an attacker anyway. Also we ++ // don't have any choice. ++ return ++ } ++ // There's nothing we can do about errors from close(2), and the ++ // only likely error to be seen is EBADF which indicates the fd was ++ // already closed (in which case, we got what we wanted). ++ _ = unix.Close(fd) ++ }) ++} ++ ++// NewSockPair returns a new SOCK_STREAM unix socket pair. ++func NewSockPair(name string) (parent, child *os.File, err error) { ++ fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil + } ++ ++// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) ++// corresponding to the unsafePath resolved within the root. Before passing the ++// fd, this path is verified to have been inside the root -- so operating on it ++// through the passed fdpath should be safe. Do not access this path through ++// the original path strings, and do not attempt to use the pathname outside of ++// the passed closure (the file handle will be freed once the closure returns). ++func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { ++ // Remove the root then forcefully resolve inside the root. ++ unsafePath = stripRoot(root, unsafePath) ++ path, err := securejoin.SecureJoin(root, unsafePath) ++ if err != nil { ++ return fmt.Errorf("resolving path inside rootfs failed: %w", err) ++ } ++ ++ procSelfFd, closer := ProcThreadSelf("fd/") ++ defer closer() ++ ++ // Open the target path. ++ fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) ++ if err != nil { ++ return fmt.Errorf("open o_path procfd: %w", err) ++ } ++ defer fh.Close() ++ ++ procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) ++ // Double-check the path is the one we expected. ++ if realpath, err := os.Readlink(procfd); err != nil { ++ return fmt.Errorf("procfd verification failed: %w", err) ++ } else if realpath != path { ++ return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) ++ } ++ ++ return fn(procfd) ++} ++ ++type ProcThreadSelfCloser func() ++ ++var ( ++ haveProcThreadSelf bool ++ haveProcThreadSelfOnce sync.Once ++) ++ ++// ProcThreadSelf returns a string that is equivalent to ++// /proc/thread-self/, with a graceful fallback on older kernels where ++// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, ++// meaning that the passed string needs to be trusted. The caller _must_ call ++// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread) ++// *only once* after it has finished using the returned path string. ++func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { ++ haveProcThreadSelfOnce.Do(func() { ++ if _, err := os.Stat("/proc/thread-self/"); err == nil { ++ haveProcThreadSelf = true ++ } else { ++ logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) ++ } ++ }) ++ ++ // We need to lock our thread until the caller is done with the path string ++ // because any non-atomic operation on the path (such as opening a file, ++ // then reading it) could be interrupted by the Go runtime where the ++ // underlying thread is swapped out and the original thread is killed, ++ // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In ++ // addition, the pre-3.17 fallback makes everything non-atomic because the ++ // same thing could happen between unix.Gettid() and the path operations. ++ // ++ // In theory, we don't need to lock in the atomic user case when using ++ // /proc/thread-self/, but it's better to be safe than sorry (and there are ++ // only one or two truly atomic users of /proc/thread-self/). ++ runtime.LockOSThread() ++ ++ threadSelf := "/proc/thread-self/" ++ if !haveProcThreadSelf { ++ // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. ++ threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" ++ if _, err := os.Stat(threadSelf); err != nil { ++ // Unfortunately, this code is called from rootfs_linux.go where we ++ // are running inside the pid namespace of the container but /proc ++ // is the host's procfs. Unfortunately there is no real way to get ++ // the correct tid to use here (the kernel age means we cannot do ++ // things like set up a private fsopen("proc") -- even scanning ++ // NSpid in all of the tasks in /proc/self/task/*/status requires ++ // Linux 4.1). ++ // ++ // So, we just have to assume that /proc/self is acceptable in this ++ // one specific case. ++ if os.Getpid() == 1 { ++ logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) ++ } else { ++ // This should never happen, but the fallback should work in most cases... ++ logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) ++ } ++ threadSelf = "/proc/self/" ++ } ++ } ++ return threadSelf + subpath, runtime.UnlockOSThread ++} ++ ++// ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to ++// create a /proc/thread-self handle for given file descriptor. ++// ++// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but ++// without using fmt.Sprintf to avoid unneeded overhead. ++func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) { ++ return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10)) ++} +diff --git a/vendor/golang.org/x/sys/unix/flock.go b/vendor/golang.org/x/sys/unix/flock.go +index ce67a59..e8d1081 100644 +--- a/vendor/golang.org/x/sys/unix/flock.go ++++ b/vendor/golang.org/x/sys/unix/flock.go +@@ -14,6 +14,11 @@ import "unsafe" + // systems by flock_linux_32bit.go to be SYS_FCNTL64. + var fcntl64Syscall uintptr = SYS_FCNTL + ++// FcntlInt performs a fcntl syscall on fd with the provided command and argument. ++func FcntlInt(fd uintptr, cmd, arg int) (int, error) { ++ return fcntl(int(fd), cmd, arg) ++} ++ + // FcntlFlock performs a fcntl syscall for the F_GETLK, F_SETLK or F_SETLKW command. + func FcntlFlock(fd uintptr, cmd int, lk *Flock_t) error { + _, _, errno := Syscall(fcntl64Syscall, fd, uintptr(cmd), uintptr(unsafe.Pointer(lk))) +diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go +index f21dcd9..e1bde81 100644 +--- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go ++++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go +@@ -934,6 +934,7 @@ const ( + PRIO_PGRP = 0x1 + PRIO_PROCESS = 0x0 + PRIO_USER = 0x2 ++ PROC_SUPER_MAGIC = 0x9fa0 + PROT_EXEC = 0x4 + PROT_GROWSDOWN = 0x1000000 + PROT_GROWSUP = 0x2000000 +diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go +index 16a18f5..388d1fc 100644 +--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go ++++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go +@@ -966,6 +966,7 @@ const ( + PRIO_PGRP = 0x1 + PRIO_PROCESS = 0x0 + PRIO_USER = 0x2 ++ PROC_SUPER_MAGIC = 0x9fa0 + PROT_EXEC = 0x4 + PROT_GROWSDOWN = 0x1000000 + PROT_GROWSUP = 0x2000000 +diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go +index 8b2e87d..fe21f83 100644 +--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go ++++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go +@@ -312,6 +312,16 @@ func Close(fd int) (err error) { + + // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + ++func CloseRange(first uint, last uint, flags uint) (err error) { ++ _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags)) ++ if e1 != 0 { ++ err = errnoErr(e1) ++ } ++ return ++} ++ ++// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT ++ + func Dup(oldfd int) (fd int, err error) { + r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0) + fd = int(r0) +diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go +index f6cc320..395e2de 100644 +--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go ++++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go +@@ -312,6 +312,16 @@ func Close(fd int) (err error) { + + // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + ++func CloseRange(first uint, last uint, flags uint) (err error) { ++ _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags)) ++ if e1 != 0 { ++ err = errnoErr(e1) ++ } ++ return ++} ++ ++// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT ++ + func Dup(oldfd int) (fd int, err error) { + r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0) + fd = int(r0) +diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go +index 9042317..f7c427c 100644 +--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go ++++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go +@@ -338,4 +338,5 @@ const ( + SYS_PKEY_MPROTECT = 329 + SYS_PKEY_ALLOC = 330 + SYS_PKEY_FREE = 331 ++ SYS_CLOSE_RANGE = 436 + ) +diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go +index 90e43d0..530563a 100644 +--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go ++++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go +@@ -282,4 +282,5 @@ const ( + SYS_PKEY_MPROTECT = 288 + SYS_PKEY_ALLOC = 289 + SYS_PKEY_FREE = 290 ++ SYS_CLOSE_RANGE = 436 + ) +diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +index c9e1e64..2f12811 100644 +--- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go ++++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +@@ -345,6 +345,11 @@ type TCPInfo struct { + Total_retrans uint32 + } + ++const ( ++ CLOSE_RANGE_UNSHARE = 0x2 ++ CLOSE_RANGE_CLOEXEC = 0x4 ++) ++ + const ( + SizeofSockaddrInet4 = 0x10 + SizeofSockaddrInet6 = 0x1c +diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +index e58c500..b77eceb 100644 +--- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go ++++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +@@ -30,6 +30,11 @@ type Timeval struct { + Usec int64 + } + ++const ( ++ CLOSE_RANGE_UNSHARE = 0x2 ++ CLOSE_RANGE_CLOEXEC = 0x4 ++) ++ + type Timex struct { + Modes uint32 + Pad_cgo_0 [4]byte +-- +2.33.0 + diff --git a/runc.spec b/runc.spec index d553193..68b20d9 100644 --- a/runc.spec +++ b/runc.spec @@ -2,7 +2,7 @@ Name: docker-runc Version: 1.0.0.rc3 -Release: 222 +Release: 223 Summary: runc is a CLI tool for spawning and running containers according to the OCI specification. License: ASL 2.0 @@ -41,6 +41,12 @@ install -p -m 755 runc $RPM_BUILD_ROOT/%{_bindir}/runc %{_bindir}/runc %changelog +* Thu Feb 1 2024 zhongjiawei - 1.0.0.rc3-223 +- Type:CVE +- CVE:CVE-2024-21626 +- SUG:NA +- DESC:fix RootDir fd leaks + * Fri Dec 8 2023 zhongjiawei - 1.0.0.rc3-222 - Type:bugfix - CVE:NA diff --git a/series.conf b/series.conf index 07b2e76..f9e40a3 100644 --- a/series.conf +++ b/series.conf @@ -139,3 +139,4 @@ 0145-runc-libcontainer-create-Cwd-when-it-does-not-exist.patch 0146-runc-delete-do-not-ignore-error-from-destroy.patch 0147-runc-libct-Destroy-don-t-proceed-in-case-of-errors.patch +0148-runc-fix-CVE-2024-21626.patch -- Gitee