diff --git a/git-commit b/git-commit index 32bdcfdfafab015dc6545da9010b1af023e4ec36..b29fd71f9780768da8a2effdf50279a22ae03032 100644 --- a/git-commit +++ b/git-commit @@ -1 +1 @@ -a26b349287fa182791c9009480361f35f0e93597 +5b7e5feb12b4d53a03cda29bbad6906415089c8c diff --git a/patch/0155-runc-fix-CVE-2024-21626.patch b/patch/0155-runc-fix-CVE-2024-21626.patch new file mode 100644 index 0000000000000000000000000000000000000000..891a3048d3033931aa8183b241918fcfd365b7db --- /dev/null +++ b/patch/0155-runc-fix-CVE-2024-21626.patch @@ -0,0 +1,1105 @@ +From 548b8d6159d13965efdf968b018dd6c81a6a128d Mon Sep 17 00:00:00 2001 +From: zhongjiawei +Date: Thu, 1 Feb 2024 17:25:53 +0800 +Subject: [PATCH] runc:fix CVE-2024-21626 + +--- + libcontainer/container_linux.go | 50 +++- + libcontainer/factory_linux.go | 15 +- + libcontainer/init_linux.go | 39 ++- + libcontainer/process_linux.go | 3 +- + libcontainer/setns_init_linux.go | 20 ++ + libcontainer/standard_init_linux.go | 28 +- + libcontainer/standard_init_linux.go.orig | 223 --------------- + libcontainer/utils/utils.go | 38 --- + libcontainer/utils/utils_unix.go | 253 ++++++++++++++++-- + vendor/golang.org/x/sys/unix/flock.go | 5 + + .../x/sys/unix/zerrors_linux_amd64.go | 1 + + .../x/sys/unix/zerrors_linux_arm64.go | 1 + + .../x/sys/unix/zsyscall_linux_amd64.go | 10 + + .../x/sys/unix/zsyscall_linux_arm64.go | 10 + + .../x/sys/unix/zsysnum_linux_amd64.go | 1 + + .../x/sys/unix/zsysnum_linux_arm64.go | 1 + + .../x/sys/unix/ztypes_linux_amd64.go | 5 + + .../x/sys/unix/ztypes_linux_arm64.go | 5 + + 18 files changed, 404 insertions(+), 304 deletions(-) + delete mode 100644 libcontainer/standard_init_linux.go.orig + +diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go +index e0fb8b4..fd191b6 100644 +--- a/libcontainer/container_linux.go ++++ b/libcontainer/container_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -28,6 +29,7 @@ import ( + 
"github.com/opencontainers/runc/libcontainer/utils" + "github.com/syndtr/gocapability/capability" + "github.com/vishvananda/netlink/nl" ++ "golang.org/x/sys/unix" + ) + + const stdioFdCount = 3 +@@ -321,6 +323,15 @@ func (c *linuxContainer) start(process *Process) error { + }() + } + ++ // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC ++ // to make sure we don't leak any files into "runc init". Any files to be ++ // passed to "runc init" through ExtraFiles will get dup2'd by the Go ++ // runtime and thus their O_CLOEXEC flag will be cleared. This is some ++ // additional protection against attacks like CVE-2024-21626, by making ++ // sure we never leak files to "runc init" we didn't intend to. ++ if err := utils.CloseExecFrom(3); err != nil { ++ return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) ++ } + if err := parent.start(); err != nil { + // terminate the process to ensure that it properly is reaped. + if err := parent.terminate(); err != nil { +@@ -414,6 +425,23 @@ func (c *linuxContainer) deleteExecFifo() { + os.Remove(fifoName) + } + ++// includeExecFifo opens the container's execfifo as a pathfd, so that the ++// container cannot access the statedir (and the FIFO itself remains ++// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited ++// fd, with _LIBCONTAINER_FIFOFD set to its fd number. 
++func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { ++ fifoName := filepath.Join(c.root, execFifoFilename) ++ fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) ++ if err != nil { ++ return err ++ } ++ ++ cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName)) ++ cmd.Env = append(cmd.Env, ++ fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) ++ return nil ++} ++ + func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { + parentPipe, childPipe, err := utils.NewSockPair("init") + if err != nil { +@@ -430,18 +458,15 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { + return c.newSetnsProcess(p, cmd, parentPipe, childPipe) + } + +- // We only set up rootDir if we're not doing a `runc exec`. The reason for +- // this is to avoid cases where a racing, unprivileged process inside the +- // container can get access to the statedir file descriptor (which would +- // allow for container rootfs escape). +- rootDir, err := os.Open(c.root) +- if err != nil { +- return nil, err ++ // We only set up fifoFd if we're not doing a `runc exec`. The historic ++ // reason for this is that previously we would pass a dirfd that allowed ++ // for container rootfs escape (and not doing it in `runc exec` avoided ++ // that problem), but we no longer do that. However, there's no need to do ++ // this for `runc exec` so we just keep it this way to be safe. 
++ if err := c.includeExecFifo(cmd); err != nil { ++ return nil, fmt.Errorf("unable to setup exec fifo: %w", err) + } +- cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) +- cmd.Env = append(cmd.Env, +- fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) +- return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) ++ return c.newInitProcess(p, cmd, parentPipe, childPipe) + } + + func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { +@@ -479,7 +504,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. + return cmd, nil + } + +-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { ++func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { +@@ -502,7 +527,6 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c + process: p, + bootstrapData: data, + sharePidns: sharePidns, +- rootDir: rootDir, + }, nil + } + +diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go +index 0b2aa74..15ba017 100644 +--- a/libcontainer/factory_linux.go ++++ b/libcontainer/factory_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -245,10 +246,10 @@ func (l *LinuxFactory) Type() string { + // This is a low level implementation detail of the reexec and should not be consumed externally + func (l *LinuxFactory) StartInitialization() (err error) { + var ( +- pipefd, rootfd int ++ pipefd, fifofd int + consoleSocket *os.File + envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") +- envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR") ++ envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD") + envConsole = 
os.Getenv("_LIBCONTAINER_CONSOLE") + ) + +@@ -264,11 +265,11 @@ func (l *LinuxFactory) StartInitialization() (err error) { + ) + defer pipe.Close() + +- // Only init processes have STATEDIR. +- rootfd = -1 ++ // Only init processes have FIFOFD. ++ fifofd = -1 + if it == initStandard { +- if rootfd, err = strconv.Atoi(envStateDir); err != nil { +- return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err) ++ if fifofd, err = strconv.Atoi(envFifoFd); err != nil { ++ return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) + } + } + +@@ -309,7 +310,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { + } + }() + +- i, err := newContainerInit(it, pipe, consoleSocket, rootfd, logPipeFd) ++ i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd) + if err != nil { + return err + } +diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go +index e9a83e9..fd417ca 100644 +--- a/libcontainer/init_linux.go ++++ b/libcontainer/init_linux.go +@@ -1,18 +1,23 @@ ++//go:build linux + // +build linux + + package libcontainer + + import ( + "encoding/json" ++ "errors" + "fmt" + "io" + "net" + "os" ++ "path/filepath" + "strings" + "syscall" + "unsafe" + + "github.com/Sirupsen/logrus" ++ "golang.org/x/sys/unix" ++ + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/system" +@@ -66,7 +71,7 @@ type initer interface { + Init() error + } + +-func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD, logFd int) (initer, error) { ++func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) { + var config *initConfig + if err := json.NewDecoder(pipe).Decode(&config); err != nil { + return nil, err +@@ -89,7 +94,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDi + 
consoleSocket: consoleSocket, + parentPid: syscall.Getppid(), + config: config, +- stateDirFD: stateDirFD, ++ fifoFd: fifoFd, + logFd: logFd, + }, nil + } +@@ -111,6 +116,32 @@ func populateProcessEnvironment(env []string) error { + return nil + } + ++// verifyCwd ensures that the current directory is actually inside the mount ++// namespace root of the current process. ++func verifyCwd() error { ++ // getcwd(2) on Linux detects if cwd is outside of the rootfs of the ++ // current mount namespace root, and in that case prefixes "(unreachable)" ++ // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect ++ // when this happens and return ENOENT rather than returning a non-absolute ++ // path. In both cases we can therefore easily detect if we have an invalid ++ // cwd by checking the return value of getcwd(3). See getcwd(3) for more ++ // details, and CVE-2024-21626 for the security issue that motivated this ++ // check. ++ // ++ // We have to use unix.Getwd() here because os.Getwd() has a workaround for ++ // $PWD which involves doing stat(.), which can fail if the current ++ // directory is inaccessible to the container process. ++ if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { ++ return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") ++ } else if err != nil { ++ return fmt.Errorf("failed to verify if current working directory is safe: %w", err) ++ } else if !filepath.IsAbs(wd) { ++ // We shouldn't ever hit this, but check just in case. 
++ return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) ++ } ++ return nil ++} ++ + // finalizeNamespace drops the caps, sets the correct user + // and working dir, and closes any leaked file descriptors + // before executing the command inside the namespace +@@ -148,6 +179,10 @@ func finalizeNamespace(config *initConfig) error { + if err := setupUser(config); err != nil { + return err + } ++ // Make sure our final working directory is inside the container. ++ if err := verifyCwd(); err != nil { ++ return err ++ } + if err := system.ClearKeepCaps(); err != nil { + return err + } +diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go +index 5cdc30c..e786419 100644 +--- a/libcontainer/process_linux.go ++++ b/libcontainer/process_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -204,7 +205,6 @@ type initProcess struct { + process *Process + bootstrapData io.Reader + sharePidns bool +- rootDir *os.File + } + + func (p *initProcess) pid() int { +@@ -257,7 +257,6 @@ func (p *initProcess) start() error { + err := p.cmd.Start() + p.process.ops = p + p.childPipe.Close() +- p.rootDir.Close() + logs.CloseChild() + if err != nil { + p.process.ops = nil +diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go +index e6dfbba..3356ccd 100644 +--- a/libcontainer/setns_init_linux.go ++++ b/libcontainer/setns_init_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -11,6 +12,7 @@ import ( + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" ++ "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/selinux/go-selinux/label" + ) + +@@ -65,5 +67,23 @@ func (l *linuxSetnsInit) Init() error { + syscall.Close(l.logFd) + } + ++ // Close all file descriptors we are 
not passing to the container. This is ++ // necessary because the execve target could use internal runc fds as the ++ // execve path, potentially giving access to binary files from the host ++ // (which can then be opened by container processes, leading to container ++ // escapes). Note that because this operation will close any open file ++ // descriptors that are referenced by (*os.File) handles from underneath ++ // the Go runtime, we must not do any file operations after this point ++ // (otherwise the (*os.File) finaliser could close the wrong file). See ++ // CVE-2024-21626 for more information as to why this protection is ++ // necessary. ++ // ++ // This is not needed for runc-dmz, because the extra execve(2) step means ++ // that all O_CLOEXEC file descriptors have already been closed and thus ++ // the second execve(2) from runc-dmz cannot access internal file ++ // descriptors from runc. ++ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { ++ return err ++ } + return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + } +diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go +index 6236593..7ebf1a2 100644 +--- a/libcontainer/standard_init_linux.go ++++ b/libcontainer/standard_init_linux.go +@@ -1,3 +1,4 @@ ++//go:build linux + // +build linux + + package libcontainer +@@ -15,14 +16,17 @@ import ( + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" ++ "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/selinux/go-selinux/label" ++ ++ "golang.org/x/sys/unix" + ) + + type linuxStandardInit struct { + pipe *os.File + consoleSocket *os.File + parentPid int +- stateDirFD int ++ fifoFd int + config *initConfig + logFd int + } +@@ -187,7 +191,7 @@ func (l *linuxStandardInit) Init() error { + // exec'ing the users process. 
+ ch := make(chan Error, 1) + go func() { +- fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) ++ fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) + if err != nil { + ch <- newSystemErrorWithCause(err, "openat exec fifo") + return +@@ -215,7 +219,25 @@ func (l *linuxStandardInit) Init() error { + } + // close the statedir fd before exec because the kernel resets dumpable in the wrong order + // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 +- syscall.Close(l.stateDirFD) ++ unix.Close(l.fifoFd) ++ // Close all file descriptors we are not passing to the container. This is ++ // necessary because the execve target could use internal runc fds as the ++ // execve path, potentially giving access to binary files from the host ++ // (which can then be opened by container processes, leading to container ++ // escapes). Note that because this operation will close any open file ++ // descriptors that are referenced by (*os.File) handles from underneath ++ // the Go runtime, we must not do any file operations after this point ++ // (otherwise the (*os.File) finaliser could close the wrong file). See ++ // CVE-2024-21626 for more information as to why this protection is ++ // necessary. ++ // ++ // This is not needed for runc-dmz, because the extra execve(2) step means ++ // that all O_CLOEXEC file descriptors have already been closed and thus ++ // the second execve(2) from runc-dmz cannot access internal file ++ // descriptors from runc. 
++ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { ++ return err ++ } + if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCause(err, "exec user process") + } +diff --git a/libcontainer/standard_init_linux.go.orig b/libcontainer/standard_init_linux.go.orig +deleted file mode 100644 +index 611b91d..0000000 +--- a/libcontainer/standard_init_linux.go.orig ++++ /dev/null +@@ -1,223 +0,0 @@ +-// +build linux +- +-package libcontainer +- +-import ( +- "fmt" +- "os" +- "os/exec" +- "strings" +- "syscall" +- "time" +- +- "github.com/opencontainers/runc/libcontainer/apparmor" +- "github.com/opencontainers/runc/libcontainer/configs" +- "github.com/opencontainers/runc/libcontainer/keys" +- "github.com/opencontainers/runc/libcontainer/seccomp" +- "github.com/opencontainers/runc/libcontainer/system" +- "github.com/opencontainers/selinux/go-selinux/label" +-) +- +-type linuxStandardInit struct { +- pipe *os.File +- consoleSocket *os.File +- parentPid int +- stateDirFD int +- config *initConfig +- logFd int +-} +- +-func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { +- var newperms uint32 +- +- if l.config.Config.Namespaces.Contains(configs.NEWUSER) { +- // with user ns we need 'other' search permissions +- newperms = 0x8 +- } else { +- // without user ns we need 'UID' search permissions +- newperms = 0x80000 +- } +- +- // create a unique per session container name that we can +- // join in setns; however, other containers can also join it +- return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms +-} +- +-// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value +-// the kernel +-const PR_SET_NO_NEW_PRIVS = 0x26 +- +-func (l *linuxStandardInit) Init() error { +- if !l.config.Config.NoNewKeyring { +- ringname, keepperms, newperms := l.getSessionRingParams() +- +- // do not inherit the parent's session keyring +- sessKeyId, 
err := keys.JoinSessionKeyring(ringname) +- if err != nil { +- return err +- } +- // make session keyring searcheable +- if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { +- return err +- } +- } +- +- if err := setupNetwork(l.config); err != nil { +- return err +- } +- if err := setupRoute(l.config.Config); err != nil { +- return err +- } +- +- label.Init() +- +- // prepareRootfs() can be executed only for a new mount namespace. +- if l.config.Config.Namespaces.Contains(configs.NEWNS) { +- if err := prepareRootfs(l.pipe, l.config.Config); err != nil { +- return err +- } +- } +- +- // Set up the console. This has to be done *before* we finalize the rootfs, +- // but *after* we've given the user the chance to set up all of the mounts +- // they wanted. +- if l.config.CreateConsole { +- if err := setupConsole(l.consoleSocket, l.config, true); err != nil { +- return err +- } +- if err := system.Setctty(); err != nil { +- return err +- } +- } +- +- // Finish the rootfs setup. 
+- if l.config.Config.Namespaces.Contains(configs.NEWNS) { +- if err := finalizeRootfs(l.config.Config); err != nil { +- return err +- } +- } +- +- if hostname := l.config.Config.Hostname; hostname != "" { +- if err := syscall.Sethostname([]byte(hostname)); err != nil { +- return err +- } +- } +- if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { +- return err +- } +- if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { +- return err +- } +- // when userns enabled, write to sysctl will fail, let docker-hooks do this job +- if len(l.config.Config.UidMappings) == 0 && len(l.config.Config.GidMappings) == 0 { +- for key, value := range l.config.Config.Sysctl { +- if err := writeSystemProperty(key, value); err != nil { +- return err +- } +- } +- } +- for _, path := range l.config.Config.ReadonlyPaths { +- if err := readonlyPath(path); err != nil { +- return err +- } +- } +- for _, m := range l.config.Config.Mounts { +- if m.Flags&syscall.MS_RDONLY == 0 && m.Device == "proc" && strings.HasPrefix(m.Destination, "/proc/sys/") { +- if err := remountReadWrite(m.Destination); err != nil { +- return err +- } +- } +- } +- for _, path := range l.config.Config.MaskPaths { +- if err := maskPath(path); err != nil { +- return err +- } +- } +- pdeath, err := system.GetParentDeathSignal() +- if err != nil { +- return err +- } +- if l.config.NoNewPrivileges { +- if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { +- return err +- } +- } +- // Tell our parent that we're ready to Execv. This must be done before the +- // Seccomp rules have been applied, because we need to be able to read and +- // write to a socket. +- if err := syncParentReady(l.pipe); err != nil { +- return err +- } +- // Without NoNewPrivileges seccomp is a privileged operation, so we need to +- // do this before dropping capabilities; otherwise do it as late as possible +- // just before execve so as few syscalls take place after it as possible. 
+- if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { +- if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { +- return err +- } +- } +- if err := finalizeNamespace(l.config); err != nil { +- return err +- } +- // finalizeNamespace can change user/group which clears the parent death +- // signal, so we restore it here. +- if err := pdeath.Restore(); err != nil { +- return err +- } +- // compare the parent from the initial start of the init process and make sure that it did not change. +- // if the parent changes that means it died and we were reparented to something else so we should +- // just kill ourself and not cause problems for someone else. +- if syscall.Getppid() != l.parentPid { +- return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) +- } +- // check for the arg before waiting to make sure it exists and it is returned +- // as a create time error. +- name, err := exec.LookPath(l.config.Args[0]) +- if err != nil { +- return err +- } +- // close the pipe to signal that we have completed our init. +- l.pipe.Close() +- +- if l.logFd != 0 { +- syscall.Close(l.logFd) +- } +- +- // wait for the fifo to be opened on the other side before +- // exec'ing the users process. 
+- ch := make(chan Error, 1) +- go func() { +- fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) +- if err != nil { +- ch <- newSystemErrorWithCause(err, "openat exec fifo") +- return +- } +- if _, err := syscall.Write(fd, []byte("0")); err != nil { +- ch <- newSystemErrorWithCause(err, "write 0 exec fifo") +- return +- } +- ch <- nil +- }() +- +- select { +- case chErr := <-ch: +- if chErr != nil { +- return chErr +- } +- case <-time.After(120 * time.Second): +- return newSystemErrorWithCause(fmt.Errorf("timeout"), "wait for the fifo to be opened on the other side ") +- } +- +- if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { +- if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { +- return newSystemErrorWithCause(err, "init seccomp") +- } +- } +- // close the statedir fd before exec because the kernel resets dumpable in the wrong order +- // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 +- syscall.Close(l.stateDirFD) +- if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { +- return newSystemErrorWithCause(err, "exec user process") +- } +- return nil +-} +diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go +index bdd13d4..a269488 100644 +--- a/libcontainer/utils/utils.go ++++ b/libcontainer/utils/utils.go +@@ -4,17 +4,12 @@ import ( + "crypto/rand" + "encoding/hex" + "encoding/json" +- "fmt" + "io" + "os" + "path/filepath" +- "strconv" + "strings" + "syscall" + "unsafe" +- +- securejoin "github.com/cyphar/filepath-securejoin" +- "golang.org/x/sys/unix" + ) + + const ( +@@ -160,36 +155,3 @@ func stripRoot(root, path string) string { + } + return CleanPath("/" + path) + } +- +-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +-// corresponding to the unsafePath resolved within the root. 
Before passing the +-// fd, this path is verified to have been inside the root -- so operating on it +-// through the passed fdpath should be safe. Do not access this path through +-// the original path strings, and do not attempt to use the pathname outside of +-// the passed closure (the file handle will be freed once the closure returns). +-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { +- // Remove the root then forcefully resolve inside the root. +- unsafePath = stripRoot(root, unsafePath) +- path, err := securejoin.SecureJoin(root, unsafePath) +- if err != nil { +- return fmt.Errorf("resolving path inside rootfs failed: %v", err) +- } +- +- // Open the target path. +- fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) +- if err != nil { +- return fmt.Errorf("open o_path procfd: %w", err) +- } +- defer fh.Close() +- +- // Double-check the path is the one we expected. +- procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) +- if realpath, err := os.Readlink(procfd); err != nil { +- return fmt.Errorf("procfd verification failed: %w", err) +- } else if realpath != path { +- return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) +- } +- +- // Run the closure. 
+- return fn(procfd) +-} +diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go +index 7b798cc..c031968 100644 +--- a/libcontainer/utils/utils_unix.go ++++ b/libcontainer/utils/utils_unix.go +@@ -1,43 +1,264 @@ ++///go:build !windows ++//go:build !windows + // +build !windows + + package utils + + import ( +- "io/ioutil" ++ "fmt" ++ "math" + "os" ++ "path/filepath" ++ "runtime" + "strconv" +- "syscall" ++ "sync" ++ _ "unsafe" // for go:linkname ++ ++ securejoin "github.com/cyphar/filepath-securejoin" ++ "github.com/Sirupsen/logrus" ++ "golang.org/x/sys/unix" + ) + +-func CloseExecFrom(minFd int) error { +- fdList, err := ioutil.ReadDir("/proc/self/fd") ++// EnsureProcHandle returns whether or not the given file handle is on procfs. ++func EnsureProcHandle(fh *os.File) error { ++ var buf unix.Statfs_t ++ if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { ++ return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) ++ } ++ if buf.Type != unix.PROC_SUPER_MAGIC { ++ return fmt.Errorf("%s is not on procfs", fh.Name()) ++ } ++ return nil ++} ++ ++var ( ++ haveCloseRangeCloexecBool bool ++ haveCloseRangeCloexecOnce sync.Once ++) ++ ++func haveCloseRangeCloexec() bool { ++ haveCloseRangeCloexecOnce.Do(func() { ++ // Make sure we're not closing a random file descriptor. ++ tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0) ++ if err != nil { ++ return ++ } ++ defer unix.Close(tmpFd) ++ ++ err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC) ++ // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC). ++ // -ENOSYS and -EINVAL ultimately mean we don't have support, but any ++ // other potential error would imply that even the most basic close ++ // operation wouldn't work. ++ haveCloseRangeCloexecBool = err == nil ++ }) ++ return haveCloseRangeCloexecBool ++} ++ ++type fdFunc func(fd int) ++ ++// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in ++// the current process. 
++func fdRangeFrom(minFd int, fn fdFunc) error { ++ procSelfFd, closer := ProcThreadSelf("fd") ++ defer closer() ++ ++ fdDir, err := os.Open(procSelfFd) ++ if err != nil { ++ return err ++ } ++ defer fdDir.Close() ++ ++ if err := EnsureProcHandle(fdDir); err != nil { ++ return err ++ } ++ ++ fdList, err := fdDir.Readdirnames(-1) + if err != nil { + return err + } +- for _, fi := range fdList { +- fd, err := strconv.Atoi(fi.Name()) ++ for _, fdStr := range fdList { ++ fd, err := strconv.Atoi(fdStr) ++ // Ignore non-numeric file names. + if err != nil { +- // ignore non-numeric file names + continue + } +- ++ // Ignore descriptors lower than our specified minimum. + if fd < minFd { +- // ignore descriptors lower than our specified minimum + continue + } +- +- // intentionally ignore errors from syscall.CloseOnExec +- syscall.CloseOnExec(fd) +- // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) ++ // Ignore the file descriptor we used for readdir, as it will be closed ++ // when we return. ++ if uintptr(fd) == fdDir.Fd() { ++ continue ++ } ++ // Run the closure. ++ fn(fd) + } + return nil + } + +-// NewSockPair returns a new unix socket pair +-func NewSockPair(name string) (parent *os.File, child *os.File, err error) { +- fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) ++// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or ++// equal to minFd in the current process. ++func CloseExecFrom(minFd int) error { ++ // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. ++ if haveCloseRangeCloexec() { ++ err := unix.CloseRange(uint(minFd), math.MaxUint64, unix.CLOSE_RANGE_CLOEXEC) ++ return os.NewSyscallError("close_range", err) ++ } ++ // Otherwise, fall back to the standard loop. 
++ return fdRangeFrom(minFd, unix.CloseOnExec) ++} ++ ++//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor ++ ++// In order to make sure we do not close the internal epoll descriptors the Go ++// runtime uses, we need to ensure that we skip descriptors that match ++// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, ++// unfortunately there's no other way to be sure we're only keeping the file ++// descriptors the Go runtime needs. Hopefully nothing blows up doing this... ++func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive ++ ++// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the ++// current process, except for those critical to Go's runtime (such as the ++// netpoll management descriptors). ++// ++// NOTE that this function is incredibly dangerous to use in most Go code, as ++// closing file descriptors from underneath *os.File handles can lead to very ++// bad behaviour (the closed file descriptor can be re-used and then any ++// *os.File operations would apply to the wrong file). This function is only ++// intended to be called from the last stage of runc init. ++func UnsafeCloseFrom(minFd int) error { ++ // We cannot use close_range(2) even if it is available, because we must ++ // not close some file descriptors. ++ return fdRangeFrom(minFd, func(fd int) { ++ if runtime_IsPollDescriptor(uintptr(fd)) { ++ // These are the Go runtime's internal netpoll file descriptors. ++ // These file descriptors are operated on deep in the Go scheduler, ++ // and closing those files from underneath Go can result in panics. ++ // There is no issue with keeping them because they are not ++ // executable and are not useful to an attacker anyway. Also we ++ // don't have any choice. ++ return ++ } ++ // There's nothing we can do about errors from close(2), and the ++ // only likely error to be seen is EBADF which indicates the fd was ++ // already closed (in which case, we got what we wanted).
++ _ = unix.Close(fd) ++ }) ++} ++ ++// NewSockPair returns a new SOCK_STREAM unix socket pair. ++func NewSockPair(name string) (parent, child *os.File, err error) { ++ fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil + } ++ ++// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) ++// corresponding to the unsafePath resolved within the root. Before passing the ++// fd, this path is verified to have been inside the root -- so operating on it ++// through the passed fdpath should be safe. Do not access this path through ++// the original path strings, and do not attempt to use the pathname outside of ++// the passed closure (the file handle will be freed once the closure returns). ++func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { ++ // Remove the root then forcefully resolve inside the root. ++ unsafePath = stripRoot(root, unsafePath) ++ path, err := securejoin.SecureJoin(root, unsafePath) ++ if err != nil { ++ return fmt.Errorf("resolving path inside rootfs failed: %w", err) ++ } ++ ++ procSelfFd, closer := ProcThreadSelf("fd/") ++ defer closer() ++ ++ // Open the target path. ++ fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) ++ if err != nil { ++ return fmt.Errorf("open o_path procfd: %w", err) ++ } ++ defer fh.Close() ++ ++ procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) ++ // Double-check the path is the one we expected. 
++ if realpath, err := os.Readlink(procfd); err != nil { ++ return fmt.Errorf("procfd verification failed: %w", err) ++ } else if realpath != path { ++ return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) ++ } ++ ++ return fn(procfd) ++} ++ ++type ProcThreadSelfCloser func() ++ ++var ( ++ haveProcThreadSelf bool ++ haveProcThreadSelfOnce sync.Once ++) ++ ++// ProcThreadSelf returns a string that is equivalent to ++// /proc/thread-self/ + subpath, with a graceful fallback on older kernels where ++// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, ++// meaning that the passed string needs to be trusted. The caller _must_ call ++// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread) ++// *only once* after it has finished using the returned path string. ++func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { ++ haveProcThreadSelfOnce.Do(func() { ++ if _, err := os.Stat("/proc/thread-self/"); err == nil { ++ haveProcThreadSelf = true ++ } else { ++ logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) ++ } ++ }) ++ ++ // We need to lock our thread until the caller is done with the path string ++ // because any non-atomic operation on the path (such as opening a file, ++ // then reading it) could be interrupted by the Go runtime where the ++ // underlying thread is swapped out and the original thread is killed, ++ // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In ++ // addition, the pre-3.17 fallback makes everything non-atomic because the ++ // same thing could happen between unix.Gettid() and the path operations. ++ // ++ // In theory, we don't need to lock in the atomic user case when using ++ // /proc/thread-self/, but it's better to be safe than sorry (and there are ++ // only one or two truly atomic users of /proc/thread-self/). 
++ runtime.LockOSThread() ++ ++ threadSelf := "/proc/thread-self/" ++ if !haveProcThreadSelf { ++ // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. ++ threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" ++ if _, err := os.Stat(threadSelf); err != nil { ++ // Unfortunately, this code is called from rootfs_linux.go where we ++ // are running inside the pid namespace of the container but /proc ++ // is the host's procfs. Unfortunately there is no real way to get ++ // the correct tid to use here (the kernel age means we cannot do ++ // things like set up a private fsopen("proc") -- even scanning ++ // NSpid in all of the tasks in /proc/self/task/*/status requires ++ // Linux 4.1). ++ // ++ // So, we just have to assume that /proc/self is acceptable in this ++ // one specific case. ++ if os.Getpid() == 1 { ++ logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) ++ } else { ++ // This should never happen, but the fallback should work in most cases... ++ logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) ++ } ++ threadSelf = "/proc/self/" ++ } ++ } ++ return threadSelf + subpath, runtime.UnlockOSThread ++} ++ ++// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier to ++// create a /proc/thread-self handle for a given file descriptor. ++// ++// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but ++// without using fmt.Sprintf to avoid unneeded overhead. 
++func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) { ++ return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10)) ++} +diff --git a/vendor/golang.org/x/sys/unix/flock.go b/vendor/golang.org/x/sys/unix/flock.go +index ce67a59..e8d1081 100644 +--- a/vendor/golang.org/x/sys/unix/flock.go ++++ b/vendor/golang.org/x/sys/unix/flock.go +@@ -14,6 +14,11 @@ import "unsafe" + // systems by flock_linux_32bit.go to be SYS_FCNTL64. + var fcntl64Syscall uintptr = SYS_FCNTL + ++// FcntlInt performs a fcntl syscall on fd with the provided command and argument. ++func FcntlInt(fd uintptr, cmd, arg int) (int, error) { ++ return fcntl(int(fd), cmd, arg) ++} ++ + // FcntlFlock performs a fcntl syscall for the F_GETLK, F_SETLK or F_SETLKW command. + func FcntlFlock(fd uintptr, cmd int, lk *Flock_t) error { + _, _, errno := Syscall(fcntl64Syscall, fd, uintptr(cmd), uintptr(unsafe.Pointer(lk))) +diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go +index f21dcd9..e1bde81 100644 +--- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go ++++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go +@@ -934,6 +934,7 @@ const ( + PRIO_PGRP = 0x1 + PRIO_PROCESS = 0x0 + PRIO_USER = 0x2 ++ PROC_SUPER_MAGIC = 0x9fa0 + PROT_EXEC = 0x4 + PROT_GROWSDOWN = 0x1000000 + PROT_GROWSUP = 0x2000000 +diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go +index 16a18f5..388d1fc 100644 +--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go ++++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go +@@ -966,6 +966,7 @@ const ( + PRIO_PGRP = 0x1 + PRIO_PROCESS = 0x0 + PRIO_USER = 0x2 ++ PROC_SUPER_MAGIC = 0x9fa0 + PROT_EXEC = 0x4 + PROT_GROWSDOWN = 0x1000000 + PROT_GROWSUP = 0x2000000 +diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go +index 8b2e87d..fe21f83 100644 +--- 
a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go ++++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go +@@ -312,6 +312,16 @@ func Close(fd int) (err error) { + + // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + ++func CloseRange(first uint, last uint, flags uint) (err error) { ++ _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags)) ++ if e1 != 0 { ++ err = errnoErr(e1) ++ } ++ return ++} ++ ++// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT ++ + func Dup(oldfd int) (fd int, err error) { + r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0) + fd = int(r0) +diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go +index f6cc320..395e2de 100644 +--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go ++++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go +@@ -312,6 +312,16 @@ func Close(fd int) (err error) { + + // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + ++func CloseRange(first uint, last uint, flags uint) (err error) { ++ _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags)) ++ if e1 != 0 { ++ err = errnoErr(e1) ++ } ++ return ++} ++ ++// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT ++ + func Dup(oldfd int) (fd int, err error) { + r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0) + fd = int(r0) +diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go +index 9042317..f7c427c 100644 +--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go ++++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go +@@ -338,4 +338,5 @@ const ( + SYS_PKEY_MPROTECT = 329 + SYS_PKEY_ALLOC = 330 + SYS_PKEY_FREE = 331 ++ SYS_CLOSE_RANGE = 436 + ) +diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go +index 90e43d0..530563a 100644 +--- 
a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go ++++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go +@@ -282,4 +282,5 @@ const ( + SYS_PKEY_MPROTECT = 288 + SYS_PKEY_ALLOC = 289 + SYS_PKEY_FREE = 290 ++ SYS_CLOSE_RANGE = 436 + ) +diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +index c9e1e64..2f12811 100644 +--- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go ++++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +@@ -345,6 +345,11 @@ type TCPInfo struct { + Total_retrans uint32 + } + ++const ( ++ CLOSE_RANGE_UNSHARE = 0x2 ++ CLOSE_RANGE_CLOEXEC = 0x4 ++) ++ + const ( + SizeofSockaddrInet4 = 0x10 + SizeofSockaddrInet6 = 0x1c +diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +index e58c500..b77eceb 100644 +--- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go ++++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +@@ -30,6 +30,11 @@ type Timeval struct { + Usec int64 + } + ++const ( ++ CLOSE_RANGE_UNSHARE = 0x2 ++ CLOSE_RANGE_CLOEXEC = 0x4 ++) ++ + type Timex struct { + Modes uint32 + Pad_cgo_0 [4]byte +-- +2.33.0 + diff --git a/runc.spec b/runc.spec index c62a062b9516e6e93e691181c2d2f9c0f9890b79..c9516374e958db530a51ab554abff2cde14948ac 100644 --- a/runc.spec +++ b/runc.spec @@ -4,7 +4,7 @@ Name: docker-runc Version: 1.0.0.rc3 -Release: 319 +Release: 320 Summary: runc is a CLI tool for spawning and running containers according to the OCI specification. 
License: ASL 2.0 @@ -57,6 +57,12 @@ install -p -m 755 runc $RPM_BUILD_ROOT/%{_bindir}/runc %{_bindir}/runc %changelog +* Thu Feb 1 2024 zhongjiawei - 1.0.0.rc3-320 +- Type:CVE +- CVE:CVE-2024-21626 +- SUG:NA +- DESC:fix RootDir fd leaks + * Fri Dec 8 2023 zhongjiawei - 1.0.0.rc3-319 - Type:bugfix - CVE:NA diff --git a/series.conf b/series.conf index 51b36e5d672703da42540e47fdadbb4d5e798a7c..d1f8acef589c0c10675d290dccfed223ce09b0f1 100644 --- a/series.conf +++ b/series.conf @@ -154,4 +154,5 @@ patch/0151-runc-fix-update-rt-runtime-us-and-rt-period-us-.patch patch/0152-runc-libcontainer-create-Cwd-when-it-does-not-exist.patch patch/0153-runc-delete-do-not-ignore-error-from-destroy.patch patch/0154-libct-Destroy-don-t-proceed-in-case-of-errors.patch +patch/0155-runc-fix-CVE-2024-21626.patch #end