From 1a6ece72043f497f98aa5f6c3d845395a04fe202 Mon Sep 17 00:00:00 2001
From: zhongjiawei <zhongjiawei1@huawei.com>
Date: Thu, 1 Feb 2024 16:51:35 +0800
Subject: [PATCH] runc: fix CVE-2024-21626

(cherry picked from commit 6e9b77988428e4184978084eccfa08612f3c5b0f)
---
 patch/0148-runc-fix-CVE-2024-21626.patch | 2765 ++++++++++++++++++++++
 runc.spec                                |    8 +-
 series.conf                              |    1 +
 3 files changed, 2773 insertions(+), 1 deletion(-)
 create mode 100644 patch/0148-runc-fix-CVE-2024-21626.patch

diff --git a/patch/0148-runc-fix-CVE-2024-21626.patch b/patch/0148-runc-fix-CVE-2024-21626.patch
new file mode 100644
index 0000000..cb87495
--- /dev/null
+++ b/patch/0148-runc-fix-CVE-2024-21626.patch
@@ -0,0 +1,2765 @@
+From e81938064402940ca8176d6f3145f65b1d455996 Mon Sep 17 00:00:00 2001
+From: zhongjiawei <zhongjiawei1@huawei.com>
+Date: Thu, 1 Feb 2024 18:25:16 +0800
+Subject: [PATCH] runc:fix CVE-2024-21626
+
+---
+ libcontainer/container_linux.go               |   50 +-
+ libcontainer/container_linux.go.orig          | 1660 -----------------
+ libcontainer/factory_linux.go                 |   15 +-
+ libcontainer/init_linux.go                    |   39 +-
+ libcontainer/process_linux.go                 |    3 +-
+ libcontainer/setns_init_linux.go              |   19 +
+ libcontainer/standard_init_linux.go           |   28 +-
+ libcontainer/standard_init_linux.go.orig      |  223 ---
+ libcontainer/utils/utils.go                   |   38 -
+ libcontainer/utils/utils_unix.go              |  253 ++-
+ vendor/golang.org/x/sys/unix/flock.go         |    5 +
+ .../x/sys/unix/zerrors_linux_amd64.go         |    1 +
+ .../x/sys/unix/zerrors_linux_arm64.go         |    1 +
+ .../x/sys/unix/zsyscall_linux_amd64.go        |   10 +
+ .../x/sys/unix/zsyscall_linux_arm64.go        |   10 +
+ .../x/sys/unix/zsysnum_linux_amd64.go         |    1 +
+ .../x/sys/unix/zsysnum_linux_arm64.go         |    1 +
+ .../x/sys/unix/ztypes_linux_amd64.go          |    5 +
+ .../x/sys/unix/ztypes_linux_arm64.go          |    5 +
+ 19 files changed, 403 insertions(+), 1964 deletions(-)
+ delete mode 100644 libcontainer/container_linux.go.orig
+ delete mode 100644 libcontainer/standard_init_linux.go.orig
+
+diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
+index a4859ca..c757d71 100644
+--- a/libcontainer/container_linux.go
++++ b/libcontainer/container_linux.go
+@@ -1,3 +1,4 @@
++//go:build linux
+ // +build linux
+ 
+ package libcontainer
+@@ -28,6 +29,7 @@ import (
+ 	"github.com/opencontainers/runc/libcontainer/utils"
+ 	"github.com/syndtr/gocapability/capability"
+ 	"github.com/vishvananda/netlink/nl"
++	"golang.org/x/sys/unix"
+ )
+ 
+ const stdioFdCount = 3
+@@ -321,6 +323,15 @@ func (c *linuxContainer) start(process *Process) error {
+ 		}()
+ 	}
+ 
++	// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
++	// to make sure we don't leak any files into "runc init". Any files to be
++	// passed to "runc init" through ExtraFiles will get dup2'd by the Go
++	// runtime and thus their O_CLOEXEC flag will be cleared. This is some
++	// additional protection against attacks like CVE-2024-21626, by making
++	// sure we never leak files to "runc init" we didn't intend to.
++	if err := utils.CloseExecFrom(3); err != nil {
++		return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
++	}
+ 	if err := parent.start(); err != nil {
+ 		// terminate the process to ensure that it properly is reaped.
+ 		if err := parent.terminate(); err != nil {
+@@ -414,6 +425,23 @@ func (c *linuxContainer) deleteExecFifo() {
+ 	os.Remove(fifoName)
+ }
+ 
++// includeExecFifo opens the container's execfifo as a pathfd, so that the
++// container cannot access the statedir (and the FIFO itself remains
++// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
++// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
++func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
++	fifoName := filepath.Join(c.root, execFifoFilename)
++	fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
++	if err != nil {
++		return err
++	}
++
++	cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
++	cmd.Env = append(cmd.Env,
++		fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
++	return nil
++}
++
+ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
+ 	parentPipe, childPipe, err := utils.NewSockPair("init")
+ 	if err != nil {
+@@ -430,18 +458,15 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
+ 		return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
+ 	}
+ 
+-	// We only set up rootDir if we're not doing a `runc exec`. The reason for
+-	// this is to avoid cases where a racing, unprivileged process inside the
+-	// container can get access to the statedir file descriptor (which would
+-	// allow for container rootfs escape).
+-	rootDir, err := os.Open(c.root)
+-	if err != nil {
+-		return nil, err
++	// We only set up fifoFd if we're not doing a `runc exec`. The historic
++	// reason for this is that previously we would pass a dirfd that allowed
++	// for container rootfs escape (and not doing it in `runc exec` avoided
++	// that problem), but we no longer do that. However, there's no need to do
++	// this for `runc exec` so we just keep it this way to be safe.
++	if err := c.includeExecFifo(cmd); err != nil {
++		return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
+ 	}
+-	cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
+-	cmd.Env = append(cmd.Env,
+-		fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
+-	return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
++	return c.newInitProcess(p, cmd, parentPipe, childPipe)
+ }
+ 
+ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
+@@ -479,7 +504,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
+ 	return cmd, nil
+ }
+ 
+-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
++func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
+ 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
+ 	nsMaps := make(map[configs.NamespaceType]string)
+ 	for _, ns := range c.config.Namespaces {
+@@ -501,7 +526,6 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
+ 		process:       p,
+ 		bootstrapData: data,
+ 		sharePidns:    !c.config.Namespaces.IsPrivate(configs.NEWPID),
+-		rootDir:       rootDir,
+ 	}, nil
+ }
+ 
+diff --git a/libcontainer/container_linux.go.orig b/libcontainer/container_linux.go.orig
+deleted file mode 100644
+index d678407..0000000
+--- a/libcontainer/container_linux.go.orig
++++ /dev/null
+@@ -1,1660 +0,0 @@
+-// +build linux
+-
+-package libcontainer
+-
+-import (
+-	"bytes"
+-	"encoding/json"
+-	"errors"
+-	"fmt"
+-	"io"
+-	"io/ioutil"
+-	"os"
+-	"os/exec"
+-	"path/filepath"
+-	"reflect"
+-	"strings"
+-	"sync"
+-	"syscall"
+-	"time"
+-
+-	"github.com/Sirupsen/logrus"
+-	"github.com/golang/protobuf/proto"
+-	"github.com/opencontainers/runc/libcontainer/cgroups"
+-	"github.com/opencontainers/runc/libcontainer/configs"
+-	"github.com/opencontainers/runc/libcontainer/criurpc"
+-	"github.com/opencontainers/runc/libcontainer/logs"
+-	"github.com/opencontainers/runc/libcontainer/system"
+-	"github.com/opencontainers/runc/libcontainer/utils"
+-	"github.com/syndtr/gocapability/capability"
+-	"github.com/vishvananda/netlink/nl"
+-)
+-
+-const stdioFdCount = 3
+-
+-type linuxContainer struct {
+-	id                   string
+-	root                 string
+-	config               *configs.Config
+-	cgroupManager        cgroups.Manager
+-	initArgs             []string
+-	initProcess          parentProcess
+-	initProcessStartTime string
+-	criuPath             string
+-	m                    sync.Mutex
+-	criuVersion          int
+-	state                containerState
+-	created              time.Time
+-}
+-
+-// State represents a running container's state
+-type State struct {
+-	BaseState
+-
+-	// Platform specific fields below here
+-
+-	// Specifies if the container was started under the rootless mode.
+-	Rootless bool `json:"rootless"`
+-
+-	// Path to all the cgroups setup for a container. Key is cgroup subsystem name
+-	// with the value as the path.
+-	CgroupPaths map[string]string `json:"cgroup_paths"`
+-
+-	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
+-	// with the value as the path.
+-	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
+-
+-	// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
+-	ExternalDescriptors []string `json:"external_descriptors,omitempty"`
+-}
+-
+-// CompatState
+-type CompatState struct {
+-	State
+-	Config configs.CompatConfig `json:"config"`
+-}
+-
+-// Container is a libcontainer container object.
+-//
+-// Each container is thread-safe within the same process. Since a container can
+-// be destroyed by a separate process, any function may return that the container
+-// was not found.
+-type Container interface {
+-	BaseContainer
+-
+-	// Methods below here are platform specific
+-
+-	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
+-	//
+-	// errors:
+-	// Systemerror - System error.
+-	Checkpoint(criuOpts *CriuOpts) error
+-
+-	// Restore restores the checkpointed container to a running state using the criu(8) utility.
+-	//
+-	// errors:
+-	// Systemerror - System error.
+-	Restore(process *Process, criuOpts *CriuOpts) error
+-
+-	// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
+-	// the execution of any user processes. Asynchronously, when the container finished being paused the
+-	// state is changed to PAUSED.
+-	// If the Container state is PAUSED, do nothing.
+-	//
+-	// errors:
+-	// ContainerNotExists - Container no longer exists,
+-	// ContainerNotRunning - Container not running or created,
+-	// Systemerror - System error.
+-	Pause() error
+-
+-	// If the Container state is PAUSED, resumes the execution of any user processes in the
+-	// Container before setting the Container state to RUNNING.
+-	// If the Container state is RUNNING, do nothing.
+-	//
+-	// errors:
+-	// ContainerNotExists - Container no longer exists,
+-	// ContainerNotPaused - Container is not paused,
+-	// Systemerror - System error.
+-	Resume() error
+-
+-	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
+-	//
+-	// errors:
+-	// Systemerror - System error.
+-	NotifyOOM() (<-chan struct{}, error)
+-
+-	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
+-	//
+-	// errors:
+-	// Systemerror - System error.
+-	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
+-}
+-
+-// ID returns the container's unique ID
+-func (c *linuxContainer) ID() string {
+-	return c.id
+-}
+-
+-// Config returns the container's configuration
+-func (c *linuxContainer) Config() configs.Config {
+-	return *c.config
+-}
+-
+-func (c *linuxContainer) Status() (Status, error) {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-	return c.currentStatus()
+-}
+-
+-func (c *linuxContainer) State() (*State, error) {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-	return c.currentState()
+-}
+-
+-func (c *linuxContainer) Processes() ([]int, error) {
+-	pids, err := c.cgroupManager.GetAllPids()
+-	if err != nil {
+-		return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
+-	}
+-	return pids, nil
+-}
+-
+-func (c *linuxContainer) Stats() (*Stats, error) {
+-	var (
+-		err   error
+-		stats = &Stats{}
+-	)
+-	if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
+-		return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
+-	}
+-	for _, iface := range c.config.Networks {
+-		switch iface.Type {
+-		case "veth":
+-			istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
+-			if err != nil {
+-				return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
+-			}
+-			stats.Interfaces = append(stats.Interfaces, istats)
+-		}
+-	}
+-	return stats, nil
+-}
+-
+-func (c *linuxContainer) Set(config configs.Config) error {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-	status, err := c.currentStatus()
+-	if err != nil {
+-		return err
+-	}
+-	if status == Stopped {
+-		return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
+-	}
+-	c.config = &config
+-	return c.cgroupManager.Set(c.config)
+-}
+-
+-func (c *linuxContainer) Start(process *Process) error {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-	if process.Init {
+-		if err := c.createExecFifo(); err != nil {
+-			return err
+-		}
+-	}
+-	if err := c.start(process); err != nil {
+-		if process.Init {
+-			c.deleteExecFifo()
+-		}
+-		return err
+-	}
+-	return nil
+-}
+-
+-func (c *linuxContainer) Run(process *Process) error {
+-	if err := c.Start(process); err != nil {
+-		return err
+-	}
+-	if process.Init {
+-		return c.exec()
+-	}
+-	return nil
+-}
+-
+-func (c *linuxContainer) Exec() error {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-	return c.exec()
+-}
+-
+-func (c *linuxContainer) exec() error {
+-	path := filepath.Join(c.root, execFifoFilename)
+-
+-	fifoOpen := make(chan struct{})
+-	select {
+-	case <-awaitProcessExit(c.initProcess.pid(), fifoOpen):
+-		return errors.New("container process is already dead")
+-	case result := <-awaitFifoOpen(path, fifoOpen):
+-		if result.err != nil {
+-			return result.err
+-		}
+-		f := result.file
+-		defer f.Close()
+-		if err := readFromExecFifo(f); err != nil {
+-			return err
+-		}
+-		if err := os.Remove(path); !os.IsNotExist(err) {
+-			return err
+-		}
+-		return nil
+-	}
+-}
+-
+-func readFromExecFifo(execFifo io.Reader) error {
+-	data, err := ioutil.ReadAll(execFifo)
+-	if err != nil {
+-		return err
+-	}
+-	if len(data) <= 0 {
+-		return fmt.Errorf("cannot start an already running container")
+-	}
+-	return nil
+-}
+-
+-func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} {
+-	isDead := make(chan struct{})
+-	go func() {
+-		for {
+-			select {
+-			case <-exit:
+-				return
+-			case <-time.After(time.Millisecond * 100):
+-				stat, err := system.GetProcessState(pid)
+-				if err != nil || stat == system.Zombie {
+-					select {
+-					case <-exit:
+-						return
+-					default:
+-						close(isDead)
+-					}
+-					return
+-				}
+-			}
+-		}
+-	}()
+-	return isDead
+-}
+-
+-func awaitFifoOpen(path string, fifoOpen chan struct{}) <-chan openResult {
+-	fifoOpened := make(chan openResult)
+-	go func() {
+-		f, err := os.OpenFile(path, os.O_RDONLY, 0)
+-		close(fifoOpen)
+-		if err != nil {
+-			fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
+-			return
+-		}
+-		fifoOpened <- openResult{file: f}
+-	}()
+-	return fifoOpened
+-}
+-
+-type openResult struct {
+-	file *os.File
+-	err  error
+-}
+-
+-func (c *linuxContainer) start(process *Process) error {
+-	parent, err := c.newParentProcess(process)
+-	if err != nil {
+-		return newSystemErrorWithCause(err, "creating new parent process")
+-	}
+-
+-	if logsDone := logs.ForwardLogs(); logsDone != nil {
+-		defer func() {
+-			select {
+-			case <-logsDone:
+-			case <-time.After(3 * time.Second):
+-				logrus.Warnf("wait child close logfd timeout")
+-			}
+-		}()
+-	}
+-
+-	if err := parent.start(); err != nil {
+-		// terminate the process to ensure that it properly is reaped.
+-		if err := parent.terminate(); err != nil {
+-			logrus.Warnf("parent process terminate error: %v", err)
+-		}
+-		return newSystemErrorWithCause(err, "starting container process")
+-	}
+-	// generate a timestamp indicating when the container was started
+-	c.created = time.Now().UTC()
+-	if process.Init {
+-		c.state = &createdState{
+-			c: c,
+-		}
+-		state, err := c.updateState(parent)
+-		if err != nil {
+-			return err
+-		}
+-		c.initProcessStartTime = state.InitProcessStartTime
+-
+-		if c.config.Hooks != nil {
+-			s := configs.HookState{
+-				SpecState: configs.SpecState{
+-					Version: c.config.Version,
+-					ID:      c.id,
+-					Pid:     parent.pid(),
+-					Bundle:  utils.SearchLabels(c.config.Labels, "bundle"),
+-				},
+-				Root: c.config.Rootfs,
+-			}
+-			for i, hook := range c.config.Hooks.Poststart {
+-				logrus.Infof("run poststart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
+-				if err := hook.Run(s); err != nil {
+-					logrus.Warnf("running poststart hook %d:%s failed: %s, ContainerId: %s", i, hook.Info(), err, s.ID)
+-				}
+-			}
+-		}
+-	} else {
+-		c.state = &runningState{
+-			c: c,
+-		}
+-	}
+-	return nil
+-}
+-
+-func (c *linuxContainer) Signal(s os.Signal, all bool) error {
+-	if all {
+-		return signalAllProcesses(c.cgroupManager, s)
+-	}
+-	status, err := c.currentStatus()
+-	if err != nil {
+-		return err
+-	}
+-	// to avoid a PID reuse attack
+-	if status == Running || status == Created {
+-		if err := c.initProcess.signal(s); err != nil {
+-			return newSystemErrorWithCause(err, "signaling init process")
+-		}
+-		return nil
+-	}
+-	return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
+-}
+-
+-func (c *linuxContainer) createExecFifo() error {
+-	rootuid, err := c.Config().HostRootUID()
+-	if err != nil {
+-		return err
+-	}
+-	rootgid, err := c.Config().HostRootGID()
+-	if err != nil {
+-		return err
+-	}
+-
+-	fifoName := filepath.Join(c.root, execFifoFilename)
+-	if _, err := os.Stat(fifoName); err == nil {
+-		return fmt.Errorf("exec fifo %s already exists", fifoName)
+-	}
+-	oldMask := syscall.Umask(0000)
+-	if err := syscall.Mkfifo(fifoName, 0622); err != nil {
+-		syscall.Umask(oldMask)
+-		return err
+-	}
+-	syscall.Umask(oldMask)
+-	if err := os.Chown(fifoName, rootuid, rootgid); err != nil {
+-		return err
+-	}
+-	return nil
+-}
+-
+-func (c *linuxContainer) deleteExecFifo() {
+-	fifoName := filepath.Join(c.root, execFifoFilename)
+-	os.Remove(fifoName)
+-}
+-
+-func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
+-	parentPipe, childPipe, err := utils.NewSockPair("init")
+-	if err != nil {
+-		return nil, newSystemErrorWithCause(err, "creating new init pipe")
+-	}
+-	if err := logs.InitLogPipe(); err != nil {
+-		return nil, fmt.Errorf("Unable to create the log pipe:  %s", err)
+-	}
+-	cmd, err := c.commandTemplate(p, childPipe)
+-	if err != nil {
+-		return nil, newSystemErrorWithCause(err, "creating new command template")
+-	}
+-	if !p.Init {
+-		return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
+-	}
+-
+-	// We only set up rootDir if we're not doing a `runc exec`. The reason for
+-	// this is to avoid cases where a racing, unprivileged process inside the
+-	// container can get access to the statedir file descriptor (which would
+-	// allow for container rootfs escape).
+-	rootDir, err := os.Open(c.root)
+-	if err != nil {
+-		return nil, err
+-	}
+-	cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
+-	cmd.Env = append(cmd.Env,
+-		fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
+-	return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
+-}
+-
+-func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
+-	cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
+-	cmd.Stdin = p.Stdin
+-	cmd.Stdout = p.Stdout
+-	cmd.Stderr = p.Stderr
+-	cmd.Dir = c.config.Rootfs
+-	if cmd.SysProcAttr == nil {
+-		cmd.SysProcAttr = &syscall.SysProcAttr{}
+-	}
+-	cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
+-	if p.ConsoleSocket != nil {
+-		cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
+-		cmd.Env = append(cmd.Env,
+-			fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
+-		)
+-	}
+-	cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
+-	cmd.Env = append(cmd.Env,
+-		fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
+-	)
+-
+-	cmd.ExtraFiles = append(cmd.ExtraFiles, logs.ChildLogPipe)
+-	cmd.Env = append(cmd.Env,
+-		fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
+-	)
+-
+-	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
+-	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
+-	// even with the parent still running.
+-	if c.config.ParentDeathSignal > 0 {
+-		cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
+-	}
+-	return cmd, nil
+-}
+-
+-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
+-	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
+-	nsMaps := make(map[configs.NamespaceType]string)
+-	for _, ns := range c.config.Namespaces {
+-		if ns.Path != "" {
+-			nsMaps[ns.Type] = ns.Path
+-		}
+-	}
+-	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
+-	if err != nil {
+-		return nil, err
+-	}
+-	return &initProcess{
+-		cmd:           cmd,
+-		childPipe:     childPipe,
+-		parentPipe:    parentPipe,
+-		manager:       c.cgroupManager,
+-		config:        c.newInitConfig(p),
+-		container:     c,
+-		process:       p,
+-		bootstrapData: data,
+-		sharePidns:    !c.config.Namespaces.IsPrivate(configs.NEWPID),
+-		rootDir:       rootDir,
+-	}, nil
+-}
+-
+-func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
+-	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
+-	state, err := c.currentState()
+-	if err != nil {
+-		return nil, newSystemErrorWithCause(err, "getting container's current state")
+-	}
+-	// for setns process, we don't have to set cloneflags as the process namespaces
+-	// will only be set via setns syscall
+-	data, err := c.bootstrapData(0, state.NamespacePaths)
+-	if err != nil {
+-		return nil, err
+-	}
+-	return &setnsProcess{
+-		cmd:           cmd,
+-		cgroupPaths:   c.cgroupManager.GetPaths(),
+-		childPipe:     childPipe,
+-		parentPipe:    parentPipe,
+-		config:        c.newInitConfig(p),
+-		process:       p,
+-		bootstrapData: data,
+-	}, nil
+-}
+-
+-func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
+-	cfg := &initConfig{
+-		Config:           c.config,
+-		Args:             process.Args,
+-		Env:              process.Env,
+-		User:             process.User,
+-		AdditionalGroups: process.AdditionalGroups,
+-		Cwd:              process.Cwd,
+-		Capabilities:     process.Capabilities,
+-		PassedFilesCount: len(process.ExtraFiles),
+-		ContainerId:      c.ID(),
+-		NoNewPrivileges:  c.config.NoNewPrivileges,
+-		Rootless:         c.config.Rootless,
+-		AppArmorProfile:  c.config.AppArmorProfile,
+-		ProcessLabel:     c.config.ProcessLabel,
+-		Rlimits:          c.config.Rlimits,
+-	}
+-	if process.NoNewPrivileges != nil {
+-		cfg.NoNewPrivileges = *process.NoNewPrivileges
+-	}
+-	if process.AppArmorProfile != "" {
+-		cfg.AppArmorProfile = process.AppArmorProfile
+-	}
+-	if process.Label != "" {
+-		cfg.ProcessLabel = process.Label
+-	}
+-	if len(process.Rlimits) > 0 {
+-		cfg.Rlimits = process.Rlimits
+-	}
+-	cfg.CreateConsole = process.ConsoleSocket != nil
+-	return cfg
+-}
+-
+-func (c *linuxContainer) Destroy() error {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-	return c.state.destroy()
+-}
+-
+-func (c *linuxContainer) Pause() error {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-	status, err := c.currentStatus()
+-	if err != nil {
+-		return err
+-	}
+-	switch status {
+-	case Running, Created:
+-		if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
+-			return err
+-		}
+-		return c.state.transition(&pausedState{
+-			c: c,
+-		})
+-	}
+-	return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
+-}
+-
+-func (c *linuxContainer) Resume() error {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-	status, err := c.currentStatus()
+-	if err != nil {
+-		return err
+-	}
+-	if status != Paused {
+-		return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
+-	}
+-	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
+-		return err
+-	}
+-	return c.state.transition(&runningState{
+-		c: c,
+-	})
+-}
+-
+-func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
+-	// XXX(cyphar): This requires cgroups.
+-	if c.config.Rootless {
+-		return nil, fmt.Errorf("cannot get OOM notifications from rootless container")
+-	}
+-	return notifyOnOOM(c.cgroupManager.GetPaths())
+-}
+-
+-func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
+-	// XXX(cyphar): This requires cgroups.
+-	if c.config.Rootless {
+-		return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container")
+-	}
+-	return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
+-}
+-
+-var criuFeatures *criurpc.CriuFeatures
+-
+-func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
+-
+-	var t criurpc.CriuReqType
+-	t = criurpc.CriuReqType_FEATURE_CHECK
+-
+-	if err := c.checkCriuVersion("1.8"); err != nil {
+-		// Feature checking was introduced with CRIU 1.8.
+-		// Ignore the feature check if an older CRIU version is used
+-		// and just act as before.
+-		// As all automated PR testing is done using CRIU 1.7 this
+-		// code will not be tested by automated PR testing.
+-		return nil
+-	}
+-
+-	// make sure the features we are looking for are really not from
+-	// some previous check
+-	criuFeatures = nil
+-
+-	req := &criurpc.CriuReq{
+-		Type: &t,
+-		// Theoretically this should not be necessary but CRIU
+-		// segfaults if Opts is empty.
+-		// Fixed in CRIU  2.12
+-		Opts:     rpcOpts,
+-		Features: criuFeat,
+-	}
+-
+-	err := c.criuSwrk(nil, req, criuOpts, false)
+-	if err != nil {
+-		logrus.Debugf("%s", err)
+-		return fmt.Errorf("CRIU feature check failed")
+-	}
+-
+-	logrus.Debugf("Feature check says: %s", criuFeatures)
+-	missingFeatures := false
+-
+-	if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
+-		missingFeatures = true
+-		logrus.Debugf("CRIU does not support MemTrack")
+-	}
+-
+-	if missingFeatures {
+-		return fmt.Errorf("CRIU is missing features")
+-	}
+-
+-	return nil
+-}
+-
+-// checkCriuVersion checks Criu version greater than or equal to minVersion
+-func (c *linuxContainer) checkCriuVersion(minVersion string) error {
+-	var x, y, z, versionReq int
+-
+-	_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
+-	if err != nil {
+-		_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
+-	}
+-	versionReq = x*10000 + y*100 + z
+-
+-	out, err := exec.Command(c.criuPath, "-V").Output()
+-	if err != nil {
+-		return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath)
+-	}
+-
+-	x = 0
+-	y = 0
+-	z = 0
+-	if ep := strings.Index(string(out), "-"); ep >= 0 {
+-		// criu Git version format
+-		var version string
+-		if sp := strings.Index(string(out), "GitID"); sp > 0 {
+-			version = string(out)[sp:ep]
+-		} else {
+-			return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath)
+-		}
+-
+-		n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
+-		if err != nil {
+-			n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6
+-			y++
+-		} else {
+-			z++
+-		}
+-		if n < 2 || err != nil {
+-			return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
+-		}
+-	} else {
+-		// criu release version format
+-		n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
+-		if err != nil {
+-			n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
+-		}
+-		if n < 2 || err != nil {
+-			return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
+-		}
+-	}
+-
+-	c.criuVersion = x*10000 + y*100 + z
+-
+-	if c.criuVersion < versionReq {
+-		return fmt.Errorf("CRIU version must be %s or higher", minVersion)
+-	}
+-
+-	return nil
+-}
+-
+-const descriptorsFilename = "descriptors.json"
+-
+-func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
+-	mountDest := m.Destination
+-	if strings.HasPrefix(mountDest, c.config.Rootfs) {
+-		mountDest = mountDest[len(c.config.Rootfs):]
+-	}
+-
+-	extMnt := &criurpc.ExtMountMap{
+-		Key: proto.String(mountDest),
+-		Val: proto.String(mountDest),
+-	}
+-	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+-}
+-
+-func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
+-	for _, path := range c.config.MaskPaths {
+-		fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
+-		if err != nil {
+-			if os.IsNotExist(err) {
+-				continue
+-			}
+-			return err
+-		}
+-		if fi.IsDir() {
+-			continue
+-		}
+-
+-		extMnt := &criurpc.ExtMountMap{
+-			Key: proto.String(path),
+-			Val: proto.String("/dev/null"),
+-		}
+-		req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+-	}
+-
+-	return nil
+-}
+-
+-func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-
+-	// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
+-	//               support for doing unprivileged dumps, but the setup of
+-	//               rootless containers might make this complicated.
+-	if c.config.Rootless {
+-		return fmt.Errorf("cannot checkpoint a rootless container")
+-	}
+-
+-	if err := c.checkCriuVersion("1.5.2"); err != nil {
+-		return err
+-	}
+-
+-	if criuOpts.ImagesDirectory == "" {
+-		return fmt.Errorf("invalid directory to save checkpoint")
+-	}
+-
+-	// Since a container can be C/R'ed multiple times,
+-	// the checkpoint directory may already exist.
+-	if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
+-		return err
+-	}
+-
+-	if criuOpts.WorkDirectory == "" {
+-		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
+-	}
+-
+-	if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
+-		return err
+-	}
+-
+-	workDir, err := os.Open(criuOpts.WorkDirectory)
+-	if err != nil {
+-		return err
+-	}
+-	defer workDir.Close()
+-
+-	imageDir, err := os.Open(criuOpts.ImagesDirectory)
+-	if err != nil {
+-		return err
+-	}
+-	defer imageDir.Close()
+-
+-	rpcOpts := criurpc.CriuOpts{
+-		ImagesDirFd:    proto.Int32(int32(imageDir.Fd())),
+-		WorkDirFd:      proto.Int32(int32(workDir.Fd())),
+-		LogLevel:       proto.Int32(4),
+-		LogFile:        proto.String("dump.log"),
+-		Root:           proto.String(c.config.Rootfs),
+-		ManageCgroups:  proto.Bool(true),
+-		NotifyScripts:  proto.Bool(true),
+-		Pid:            proto.Int32(int32(c.initProcess.pid())),
+-		ShellJob:       proto.Bool(criuOpts.ShellJob),
+-		LeaveRunning:   proto.Bool(criuOpts.LeaveRunning),
+-		TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
+-		ExtUnixSk:      proto.Bool(criuOpts.ExternalUnixConnections),
+-		FileLocks:      proto.Bool(criuOpts.FileLocks),
+-		EmptyNs:        proto.Uint32(criuOpts.EmptyNs),
+-	}
+-
+-	// append optional criu opts, e.g., page-server and port
+-	if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
+-		rpcOpts.Ps = &criurpc.CriuPageServerInfo{
+-			Address: proto.String(criuOpts.PageServer.Address),
+-			Port:    proto.Int32(criuOpts.PageServer.Port),
+-		}
+-	}
+-
+-	//pre-dump may need parentImage param to complete iterative migration
+-	if criuOpts.ParentImage != "" {
+-		rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
+-		rpcOpts.TrackMem = proto.Bool(true)
+-	}
+-
+-	// append optional manage cgroups mode
+-	if criuOpts.ManageCgroupsMode != 0 {
+-		if err := c.checkCriuVersion("1.7"); err != nil {
+-			return err
+-		}
+-		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
+-		rpcOpts.ManageCgroupsMode = &mode
+-	}
+-
+-	var t criurpc.CriuReqType
+-	if criuOpts.PreDump {
+-		feat := criurpc.CriuFeatures{
+-			MemTrack: proto.Bool(true),
+-		}
+-
+-		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
+-			return err
+-		}
+-
+-		t = criurpc.CriuReqType_PRE_DUMP
+-	} else {
+-		t = criurpc.CriuReqType_DUMP
+-	}
+-	req := &criurpc.CriuReq{
+-		Type: &t,
+-		Opts: &rpcOpts,
+-	}
+-
+-	//no need to dump these information in pre-dump
+-	if !criuOpts.PreDump {
+-		for _, m := range c.config.Mounts {
+-			switch m.Device {
+-			case "bind":
+-				c.addCriuDumpMount(req, m)
+-				break
+-			case "cgroup":
+-				binds, err := getCgroupMounts(m)
+-				if err != nil {
+-					return err
+-				}
+-				for _, b := range binds {
+-					c.addCriuDumpMount(req, b)
+-				}
+-				break
+-			}
+-		}
+-
+-		if err := c.addMaskPaths(req); err != nil {
+-			return err
+-		}
+-
+-		for _, node := range c.config.Devices {
+-			m := &configs.Mount{Destination: node.Path, Source: node.Path}
+-			c.addCriuDumpMount(req, m)
+-		}
+-
+-		// Write the FD info to a file in the image directory
+-		fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
+-		if err != nil {
+-			return err
+-		}
+-
+-		err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
+-		if err != nil {
+-			return err
+-		}
+-	}
+-
+-	err = c.criuSwrk(nil, req, criuOpts, false)
+-	if err != nil {
+-		return err
+-	}
+-	return nil
+-}
+-
+-func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
+-	mountDest := m.Destination
+-	if strings.HasPrefix(mountDest, c.config.Rootfs) {
+-		mountDest = mountDest[len(c.config.Rootfs):]
+-	}
+-
+-	extMnt := &criurpc.ExtMountMap{
+-		Key: proto.String(mountDest),
+-		Val: proto.String(m.Source),
+-	}
+-	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
+-}
+-
+-func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
+-	for _, iface := range c.config.Networks {
+-		switch iface.Type {
+-		case "veth":
+-			veth := new(criurpc.CriuVethPair)
+-			veth.IfOut = proto.String(iface.HostInterfaceName)
+-			veth.IfIn = proto.String(iface.Name)
+-			req.Opts.Veths = append(req.Opts.Veths, veth)
+-			break
+-		case "loopback":
+-			break
+-		}
+-	}
+-	for _, i := range criuOpts.VethPairs {
+-		veth := new(criurpc.CriuVethPair)
+-		veth.IfOut = proto.String(i.HostInterfaceName)
+-		veth.IfIn = proto.String(i.ContainerInterfaceName)
+-		req.Opts.Veths = append(req.Opts.Veths, veth)
+-	}
+-}
+-
+-func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
+-	c.m.Lock()
+-	defer c.m.Unlock()
+-
+-	// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
+-	//               support for unprivileged restore at the moment.
+-	if c.config.Rootless {
+-		return fmt.Errorf("cannot restore a rootless container")
+-	}
+-
+-	if err := c.checkCriuVersion("1.5.2"); err != nil {
+-		return err
+-	}
+-	if criuOpts.WorkDirectory == "" {
+-		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
+-	}
+-	// Since a container can be C/R'ed multiple times,
+-	// the work directory may already exist.
+-	if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
+-		return err
+-	}
+-	workDir, err := os.Open(criuOpts.WorkDirectory)
+-	if err != nil {
+-		return err
+-	}
+-	defer workDir.Close()
+-	if criuOpts.ImagesDirectory == "" {
+-		return fmt.Errorf("invalid directory to restore checkpoint")
+-	}
+-	imageDir, err := os.Open(criuOpts.ImagesDirectory)
+-	if err != nil {
+-		return err
+-	}
+-	defer imageDir.Close()
+-	// CRIU has a few requirements for a root directory:
+-	// * it must be a mount point
+-	// * its parent must not be overmounted
+-	// c.config.Rootfs is bind-mounted to a temporary directory
+-	// to satisfy these requirements.
+-	root := filepath.Join(c.root, "criu-root")
+-	if err := os.Mkdir(root, 0755); err != nil {
+-		return err
+-	}
+-	defer os.Remove(root)
+-	root, err = filepath.EvalSymlinks(root)
+-	if err != nil {
+-		return err
+-	}
+-	err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
+-	if err != nil {
+-		return err
+-	}
+-	defer syscall.Unmount(root, syscall.MNT_DETACH)
+-	t := criurpc.CriuReqType_RESTORE
+-	req := &criurpc.CriuReq{
+-		Type: &t,
+-		Opts: &criurpc.CriuOpts{
+-			ImagesDirFd:    proto.Int32(int32(imageDir.Fd())),
+-			WorkDirFd:      proto.Int32(int32(workDir.Fd())),
+-			EvasiveDevices: proto.Bool(true),
+-			LogLevel:       proto.Int32(4),
+-			LogFile:        proto.String("restore.log"),
+-			RstSibling:     proto.Bool(true),
+-			Root:           proto.String(root),
+-			ManageCgroups:  proto.Bool(true),
+-			NotifyScripts:  proto.Bool(true),
+-			ShellJob:       proto.Bool(criuOpts.ShellJob),
+-			ExtUnixSk:      proto.Bool(criuOpts.ExternalUnixConnections),
+-			TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
+-			FileLocks:      proto.Bool(criuOpts.FileLocks),
+-			EmptyNs:        proto.Uint32(criuOpts.EmptyNs),
+-		},
+-	}
+-
+-	for _, m := range c.config.Mounts {
+-		switch m.Device {
+-		case "bind":
+-			c.addCriuRestoreMount(req, m)
+-			break
+-		case "cgroup":
+-			binds, err := getCgroupMounts(m)
+-			if err != nil {
+-				return err
+-			}
+-			for _, b := range binds {
+-				c.addCriuRestoreMount(req, b)
+-			}
+-			break
+-		}
+-	}
+-
+-	if len(c.config.MaskPaths) > 0 {
+-		m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
+-		c.addCriuRestoreMount(req, m)
+-	}
+-
+-	for _, node := range c.config.Devices {
+-		m := &configs.Mount{Destination: node.Path, Source: node.Path}
+-		c.addCriuRestoreMount(req, m)
+-	}
+-
+-	if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
+-		c.restoreNetwork(req, criuOpts)
+-	}
+-
+-	// append optional manage cgroups mode
+-	if criuOpts.ManageCgroupsMode != 0 {
+-		if err := c.checkCriuVersion("1.7"); err != nil {
+-			return err
+-		}
+-		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
+-		req.Opts.ManageCgroupsMode = &mode
+-	}
+-
+-	var (
+-		fds    []string
+-		fdJSON []byte
+-	)
+-	if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
+-		return err
+-	}
+-
+-	if err := json.Unmarshal(fdJSON, &fds); err != nil {
+-		return err
+-	}
+-	for i := range fds {
+-		if s := fds[i]; strings.Contains(s, "pipe:") {
+-			inheritFd := new(criurpc.InheritFd)
+-			inheritFd.Key = proto.String(s)
+-			inheritFd.Fd = proto.Int32(int32(i))
+-			req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
+-		}
+-	}
+-	return c.criuSwrk(process, req, criuOpts, true)
+-}
+-
+-func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
+-	// XXX: Do we need to deal with this case? AFAIK criu still requires root.
+-	if err := c.cgroupManager.Apply(pid); err != nil {
+-		return err
+-	}
+-
+-	if err := c.cgroupManager.Set(c.config); err != nil {
+-		return newSystemError(err)
+-	}
+-
+-	path := fmt.Sprintf("/proc/%d/cgroup", pid)
+-	cgroupsPaths, err := cgroups.ParseCgroupFile(path)
+-	if err != nil {
+-		return err
+-	}
+-
+-	for c, p := range cgroupsPaths {
+-		cgroupRoot := &criurpc.CgroupRoot{
+-			Ctrl: proto.String(c),
+-			Path: proto.String(p),
+-		}
+-		req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
+-	}
+-
+-	return nil
+-}
+-
+-func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
+-	fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
+-	if err != nil {
+-		return err
+-	}
+-
+-	logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
+-	criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
+-	criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
+-	defer criuClient.Close()
+-	defer criuServer.Close()
+-
+-	args := []string{"swrk", "3"}
+-	logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
+-	logrus.Debugf("Using CRIU with following args: %s", args)
+-	cmd := exec.Command(c.criuPath, args...)
+-	if process != nil {
+-		cmd.Stdin = process.Stdin
+-		cmd.Stdout = process.Stdout
+-		cmd.Stderr = process.Stderr
+-	}
+-	cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
+-
+-	if err := cmd.Start(); err != nil {
+-		return err
+-	}
+-	criuServer.Close()
+-
+-	defer func() {
+-		criuClient.Close()
+-		_, err := cmd.Process.Wait()
+-		if err != nil {
+-			return
+-		}
+-	}()
+-
+-	if applyCgroups {
+-		err := c.criuApplyCgroups(cmd.Process.Pid, req)
+-		if err != nil {
+-			return err
+-		}
+-	}
+-
+-	var extFds []string
+-	if process != nil {
+-		extFds, err = getPipeFds(cmd.Process.Pid)
+-		if err != nil {
+-			return err
+-		}
+-	}
+-
+-	logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
+-	// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
+-	// should be empty. For older CRIU versions it still will be
+-	// available but empty.
+-	if req.GetType() != criurpc.CriuReqType_FEATURE_CHECK {
+-		val := reflect.ValueOf(req.GetOpts())
+-		v := reflect.Indirect(val)
+-		for i := 0; i < v.NumField(); i++ {
+-			st := v.Type()
+-			name := st.Field(i).Name
+-			if strings.HasPrefix(name, "XXX_") {
+-				continue
+-			}
+-			value := val.MethodByName("Get" + name).Call([]reflect.Value{})
+-			logrus.Debugf("CRIU option %s with value %v", name, value[0])
+-		}
+-	}
+-	data, err := proto.Marshal(req)
+-	if err != nil {
+-		return err
+-	}
+-	_, err = criuClient.Write(data)
+-	if err != nil {
+-		return err
+-	}
+-
+-	buf := make([]byte, 10*4096)
+-	for true {
+-		n, err := criuClient.Read(buf)
+-		if err != nil {
+-			return err
+-		}
+-		if n == 0 {
+-			return fmt.Errorf("unexpected EOF")
+-		}
+-		if n == len(buf) {
+-			return fmt.Errorf("buffer is too small")
+-		}
+-
+-		resp := new(criurpc.CriuResp)
+-		err = proto.Unmarshal(buf[:n], resp)
+-		if err != nil {
+-			return err
+-		}
+-		if !resp.GetSuccess() {
+-			typeString := req.GetType().String()
+-			return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
+-		}
+-
+-		t := resp.GetType()
+-		switch {
+-		case t == criurpc.CriuReqType_FEATURE_CHECK:
+-			logrus.Debugf("Feature check says: %s", resp)
+-			criuFeatures = resp.GetFeatures()
+-			break
+-		case t == criurpc.CriuReqType_NOTIFY:
+-			if err := c.criuNotifications(resp, process, opts, extFds); err != nil {
+-				return err
+-			}
+-			t = criurpc.CriuReqType_NOTIFY
+-			req = &criurpc.CriuReq{
+-				Type:          &t,
+-				NotifySuccess: proto.Bool(true),
+-			}
+-			data, err = proto.Marshal(req)
+-			if err != nil {
+-				return err
+-			}
+-			_, err = criuClient.Write(data)
+-			if err != nil {
+-				return err
+-			}
+-			continue
+-		case t == criurpc.CriuReqType_RESTORE:
+-		case t == criurpc.CriuReqType_DUMP:
+-			break
+-		case t == criurpc.CriuReqType_PRE_DUMP:
+-			// In pre-dump mode CRIU is in a loop and waits for
+-			// the final DUMP command.
+-			// The current runc pre-dump approach, however, is
+-			// start criu in PRE_DUMP once for a single pre-dump
+-			// and not the whole series of pre-dump, pre-dump, ...m, dump
+-			// If we got the message CriuReqType_PRE_DUMP it means
+-			// CRIU was successful and we need to forcefully stop CRIU
+-			logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service")
+-			criuClient.Close()
+-			// Process status won't be success, because one end of sockets is closed
+-			_, err := cmd.Process.Wait()
+-			if err != nil {
+-				logrus.Debugf("After PRE_DUMP CRIU exiting failed")
+-				return err
+-			}
+-			return nil
+-		default:
+-			return fmt.Errorf("unable to parse the response %s", resp.String())
+-		}
+-
+-		break
+-	}
+-
+-	// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
+-	// Here we want to wait only the CRIU process.
+-	st, err := cmd.Process.Wait()
+-	if err != nil {
+-		return err
+-	}
+-	if !st.Success() {
+-		return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
+-	}
+-	return nil
+-}
+-
+-// block any external network activity
+-func lockNetwork(config *configs.Config) error {
+-	for _, config := range config.Networks {
+-		strategy, err := getStrategy(config.Type)
+-		if err != nil {
+-			return err
+-		}
+-
+-		if err := strategy.detach(config); err != nil {
+-			return err
+-		}
+-	}
+-	return nil
+-}
+-
+-func unlockNetwork(config *configs.Config) error {
+-	for _, config := range config.Networks {
+-		strategy, err := getStrategy(config.Type)
+-		if err != nil {
+-			return err
+-		}
+-		if err = strategy.attach(config); err != nil {
+-			return err
+-		}
+-	}
+-	return nil
+-}
+-
+-func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error {
+-	notify := resp.GetNotify()
+-	if notify == nil {
+-		return fmt.Errorf("invalid response: %s", resp.String())
+-	}
+-	switch {
+-	case notify.GetScript() == "post-dump":
+-		f, err := os.Create(filepath.Join(c.root, "checkpoint"))
+-		if err != nil {
+-			return err
+-		}
+-		f.Close()
+-	case notify.GetScript() == "network-unlock":
+-		if err := unlockNetwork(c.config); err != nil {
+-			return err
+-		}
+-	case notify.GetScript() == "network-lock":
+-		if err := lockNetwork(c.config); err != nil {
+-			return err
+-		}
+-	case notify.GetScript() == "setup-namespaces":
+-		if c.config.Hooks != nil {
+-			s := configs.HookState{
+-				SpecState: configs.SpecState{
+-					Version: c.config.Version,
+-					ID:      c.id,
+-					Pid:     int(notify.GetPid()),
+-					Bundle:  utils.SearchLabels(c.config.Labels, "bundle"),
+-				},
+-				Root: c.config.Rootfs,
+-			}
+-			for i, hook := range c.config.Hooks.Prestart {
+-				logrus.Infof("run prestart hook: %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
+-				if err := hook.Run(s); err != nil {
+-					return newSystemErrorWithCausef(err, "running prestart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
+-				}
+-				logrus.Infof("prestart hook: %d:%s done", i, hook.Info())
+-			}
+-		}
+-	case notify.GetScript() == "post-restore":
+-		pid := notify.GetPid()
+-		r, err := newRestoredProcess(int(pid), fds)
+-		if err != nil {
+-			return err
+-		}
+-		process.ops = r
+-		if err := c.state.transition(&restoredState{
+-			imageDir: opts.ImagesDirectory,
+-			c:        c,
+-		}); err != nil {
+-			return err
+-		}
+-		// create a timestamp indicating when the restored checkpoint was started
+-		c.created = time.Now().UTC()
+-		if _, err := c.updateState(r); err != nil {
+-			return err
+-		}
+-		if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
+-			if !os.IsNotExist(err) {
+-				logrus.Error(err)
+-			}
+-		}
+-	}
+-	return nil
+-}
+-
+-func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
+-	c.initProcess = process
+-	state, err := c.currentState()
+-	if err != nil {
+-		return nil, err
+-	}
+-	err = c.saveState(state)
+-	if err != nil {
+-		return nil, err
+-	}
+-	return state, nil
+-}
+-
+-func (c *linuxContainer) saveState(s *State) error {
+-	f, err := os.Create(filepath.Join(c.root, stateFilename))
+-	if err != nil {
+-		return err
+-	}
+-	defer f.Close()
+-	return utils.WriteJSON(f, s)
+-}
+-
+-func (c *linuxContainer) deleteState() error {
+-	return os.Remove(filepath.Join(c.root, stateFilename))
+-}
+-
+-func (c *linuxContainer) currentStatus() (Status, error) {
+-	if err := c.refreshState(); err != nil {
+-		return -1, err
+-	}
+-	return c.state.status(), nil
+-}
+-
+-// refreshState needs to be called to verify that the current state on the
+-// container is what is true.  Because consumers of libcontainer can use it
+-// out of process we need to verify the container's status based on runtime
+-// information and not rely on our in process info.
+-func (c *linuxContainer) refreshState() error {
+-	paused, err := c.isPaused()
+-	if err != nil {
+-		return err
+-	}
+-	if paused {
+-		return c.state.transition(&pausedState{c: c})
+-	}
+-	t, err := c.runType()
+-	if err != nil {
+-		return err
+-	}
+-	switch t {
+-	case Created:
+-		return c.state.transition(&createdState{c: c})
+-	case Running:
+-		return c.state.transition(&runningState{c: c})
+-	}
+-	return c.state.transition(&stoppedState{c: c})
+-}
+-
+-// doesInitProcessExist checks if the init process is still the same process
+-// as the initial one, it could happen that the original process has exited
+-// and a new process has been created with the same pid, in this case, the
+-// container would already be stopped.
+-func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
+-	startTime, err := system.GetProcessStartTime(initPid)
+-	if err != nil {
+-		return false, nil
+-	}
+-	if c.initProcessStartTime != startTime {
+-		return false, nil
+-	}
+-	return true, nil
+-}
+-
+-func (c *linuxContainer) runType() (Status, error) {
+-	if c.initProcess == nil {
+-		return Stopped, nil
+-	}
+-	pid := c.initProcess.pid()
+-	// return Running if the init process is alive
+-	if err := syscall.Kill(pid, 0); err != nil {
+-		if err == syscall.ESRCH {
+-			// It means the process does not exist anymore, could happen when the
+-			// process exited just when we call the function, we should not return
+-			// error in this case.
+-			return Stopped, nil
+-		}
+-		return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
+-	}
+-	// check if the process is still the original init process.
+-	exist, err := c.doesInitProcessExist(pid)
+-	if !exist || err != nil {
+-		return Stopped, err
+-	}
+-	// We'll create exec fifo and blocking on it after container is created,
+-	// and delete it after start container.
+-	if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
+-		return Created, nil
+-	}
+-	return Running, nil
+-}
+-
+-func (c *linuxContainer) isPaused() (bool, error) {
+-	fcg := c.cgroupManager.GetPaths()["freezer"]
+-	if fcg == "" {
+-		// A container doesn't have a freezer cgroup
+-		return false, nil
+-	}
+-	data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state"))
+-	if err != nil {
+-		// If freezer cgroup is not mounted, the container would just be not paused.
+-		if os.IsNotExist(err) {
+-			return false, nil
+-		}
+-		return false, newSystemErrorWithCause(err, "checking if container is paused")
+-	}
+-	return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
+-}
+-
+-func (c *linuxContainer) currentState() (*State, error) {
+-	var (
+-		startTime           string
+-		externalDescriptors []string
+-		pid                 = -1
+-	)
+-	if c.initProcess != nil {
+-		pid = c.initProcess.pid()
+-		startTime, _ = c.initProcess.startTime()
+-		externalDescriptors = c.initProcess.externalDescriptors()
+-	}
+-	state := &State{
+-		BaseState: BaseState{
+-			ID:                   c.ID(),
+-			Config:               *c.config,
+-			InitProcessPid:       pid,
+-			InitProcessStartTime: startTime,
+-			Created:              c.created,
+-		},
+-		Rootless:            c.config.Rootless,
+-		CgroupPaths:         c.cgroupManager.GetPaths(),
+-		NamespacePaths:      make(map[configs.NamespaceType]string),
+-		ExternalDescriptors: externalDescriptors,
+-	}
+-	if pid > 0 {
+-		for _, ns := range c.config.Namespaces {
+-			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
+-		}
+-		for _, nsType := range configs.NamespaceTypes() {
+-			if !configs.IsNamespaceSupported(nsType) {
+-				continue
+-			}
+-			if _, ok := state.NamespacePaths[nsType]; !ok {
+-				ns := configs.Namespace{Type: nsType}
+-				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
+-			}
+-		}
+-	}
+-	return state, nil
+-}
+-
+-// orderNamespacePaths sorts namespace paths into a list of paths that we
+-// can setns in order.
+-func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
+-	paths := []string{}
+-
+-	for _, ns := range configs.NamespaceTypes() {
+-
+-		// Remove namespaces that we don't need to join.
+-		if !c.config.Namespaces.Contains(ns) {
+-			continue
+-		}
+-
+-		if p, ok := namespaces[ns]; ok && p != "" {
+-			// check if the requested namespace is supported
+-			if !configs.IsNamespaceSupported(ns) {
+-				return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
+-			}
+-			// only set to join this namespace if it exists
+-			if _, err := os.Lstat(p); err != nil {
+-				return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
+-			}
+-			// do not allow namespace path with comma as we use it to separate
+-			// the namespace paths
+-			if strings.ContainsRune(p, ',') {
+-				return nil, newSystemError(fmt.Errorf("invalid path %s", p))
+-			}
+-			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
+-		}
+-
+-	}
+-
+-	return paths, nil
+-}
+-
+-func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
+-	data := bytes.NewBuffer(nil)
+-	for _, im := range idMap {
+-		line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
+-		if _, err := data.WriteString(line); err != nil {
+-			return nil, err
+-		}
+-	}
+-	return data.Bytes(), nil
+-}
+-
+-// bootstrapData encodes the necessary data in netlink binary format
+-// as a io.Reader.
+-// Consumer can write the data to a bootstrap program
+-// such as one that uses nsenter package to bootstrap the container's
+-// init process correctly, i.e. with correct namespaces, uid/gid
+-// mapping etc.
+-func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
+-	// create the netlink message
+-	r := nl.NewNetlinkRequest(int(InitMsg), 0)
+-
+-	// write cloneFlags
+-	r.AddData(&Int32msg{
+-		Type:  CloneFlagsAttr,
+-		Value: uint32(cloneFlags),
+-	})
+-
+-	// write custom namespace paths
+-	if len(nsMaps) > 0 {
+-		nsPaths, err := c.orderNamespacePaths(nsMaps)
+-		if err != nil {
+-			return nil, err
+-		}
+-		r.AddData(&Bytemsg{
+-			Type:  NsPathsAttr,
+-			Value: []byte(strings.Join(nsPaths, ",")),
+-		})
+-	}
+-
+-	// write namespace paths only when we are not joining an existing user ns
+-	_, joinExistingUser := nsMaps[configs.NEWUSER]
+-	if !joinExistingUser {
+-		// write uid mappings
+-		if len(c.config.UidMappings) > 0 {
+-			b, err := encodeIDMapping(c.config.UidMappings)
+-			if err != nil {
+-				return nil, err
+-			}
+-			r.AddData(&Bytemsg{
+-				Type:  UidmapAttr,
+-				Value: b,
+-			})
+-		}
+-
+-		// write gid mappings
+-		if len(c.config.GidMappings) > 0 {
+-			b, err := encodeIDMapping(c.config.GidMappings)
+-			if err != nil {
+-				return nil, err
+-			}
+-			r.AddData(&Bytemsg{
+-				Type:  GidmapAttr,
+-				Value: b,
+-			})
+-			// The following only applies if we are root.
+-			if !c.config.Rootless {
+-				// check if we have CAP_SETGID to setgroup properly
+-				pid, err := capability.NewPid(os.Getpid())
+-				if err != nil {
+-					return nil, err
+-				}
+-				if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
+-					r.AddData(&Boolmsg{
+-						Type:  SetgroupAttr,
+-						Value: true,
+-					})
+-				}
+-			}
+-		}
+-	}
+-
+-	// write oom_score_adj
+-	r.AddData(&Bytemsg{
+-		Type:  OomScoreAdjAttr,
+-		Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
+-	})
+-
+-	// write rootless
+-	r.AddData(&Boolmsg{
+-		Type:  RootlessAttr,
+-		Value: c.config.Rootless,
+-	})
+-
+-	return bytes.NewReader(r.Serialize()), nil
+-}
+diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
+index 0b2aa74..15ba017 100644
+--- a/libcontainer/factory_linux.go
++++ b/libcontainer/factory_linux.go
+@@ -1,3 +1,4 @@
++//go:build linux
+ // +build linux
+ 
+ package libcontainer
+@@ -245,10 +246,10 @@ func (l *LinuxFactory) Type() string {
+ // This is a low level implementation detail of the reexec and should not be consumed externally
+ func (l *LinuxFactory) StartInitialization() (err error) {
+ 	var (
+-		pipefd, rootfd int
++		pipefd, fifofd int
+ 		consoleSocket  *os.File
+ 		envInitPipe    = os.Getenv("_LIBCONTAINER_INITPIPE")
+-		envStateDir    = os.Getenv("_LIBCONTAINER_STATEDIR")
++		envFifoFd      = os.Getenv("_LIBCONTAINER_FIFOFD")
+ 		envConsole     = os.Getenv("_LIBCONTAINER_CONSOLE")
+ 	)
+ 
+@@ -264,11 +265,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
+ 	)
+ 	defer pipe.Close()
+ 
+-	// Only init processes have STATEDIR.
+-	rootfd = -1
++	// Only init processes have FIFOFD.
++	fifofd = -1
+ 	if it == initStandard {
+-		if rootfd, err = strconv.Atoi(envStateDir); err != nil {
+-			return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
++		if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
++			return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
+ 		}
+ 	}
+ 
+@@ -309,7 +310,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
+ 		}
+ 	}()
+ 
+-	i, err := newContainerInit(it, pipe, consoleSocket, rootfd, logPipeFd)
++	i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
+ 	if err != nil {
+ 		return err
+ 	}
+diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
+index e9a83e9..fd417ca 100644
+--- a/libcontainer/init_linux.go
++++ b/libcontainer/init_linux.go
+@@ -1,18 +1,23 @@
++//go:build linux
+ // +build linux
+ 
+ package libcontainer
+ 
+ import (
+ 	"encoding/json"
++	"errors"
+ 	"fmt"
+ 	"io"
+ 	"net"
+ 	"os"
++	"path/filepath"
+ 	"strings"
+ 	"syscall"
+ 	"unsafe"
+ 
+ 	"github.com/Sirupsen/logrus"
++	"golang.org/x/sys/unix"
++
+ 	"github.com/opencontainers/runc/libcontainer/cgroups"
+ 	"github.com/opencontainers/runc/libcontainer/configs"
+ 	"github.com/opencontainers/runc/libcontainer/system"
+@@ -66,7 +71,7 @@ type initer interface {
+ 	Init() error
+ }
+ 
+-func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD, logFd int) (initer, error) {
++func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) {
+ 	var config *initConfig
+ 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
+ 		return nil, err
+@@ -89,7 +94,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDi
+ 			consoleSocket: consoleSocket,
+ 			parentPid:     syscall.Getppid(),
+ 			config:        config,
+-			stateDirFD:    stateDirFD,
++			fifoFd:        fifoFd,
+ 			logFd:         logFd,
+ 		}, nil
+ 	}
+@@ -111,6 +116,32 @@ func populateProcessEnvironment(env []string) error {
+ 	return nil
+ }
+ 
++// verifyCwd ensures that the current directory is actually inside the mount
++// namespace root of the current process.
++func verifyCwd() error {
++	// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
++	// current mount namespace root, and in that case prefixes "(unreachable)"
++	// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
++	// when this happens and return ENOENT rather than returning a non-absolute
++	// path. In both cases we can therefore easily detect if we have an invalid
++	// cwd by checking the return value of getcwd(3). See getcwd(3) for more
++	// details, and CVE-2024-21626 for the security issue that motivated this
++	// check.
++	//
++	// We have to use unix.Getwd() here because os.Getwd() has a workaround for
++	// $PWD which involves doing stat(.), which can fail if the current
++	// directory is inaccessible to the container process.
++	if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
++		return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
++	} else if err != nil {
++		return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
++	} else if !filepath.IsAbs(wd) {
++		// We shouldn't ever hit this, but check just in case.
++		return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
++	}
++	return nil
++}
++
+ // finalizeNamespace drops the caps, sets the correct user
+ // and working dir, and closes any leaked file descriptors
+ // before executing the command inside the namespace
+@@ -148,6 +179,10 @@ func finalizeNamespace(config *initConfig) error {
+ 	if err := setupUser(config); err != nil {
+ 		return err
+ 	}
++	// Make sure our final working directory is inside the container.
++	if err := verifyCwd(); err != nil {
++		return err
++	}
+ 	if err := system.ClearKeepCaps(); err != nil {
+ 		return err
+ 	}
+diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
+index 5cdc30c..e786419 100644
+--- a/libcontainer/process_linux.go
++++ b/libcontainer/process_linux.go
+@@ -1,3 +1,4 @@
++//go:build linux
+ // +build linux
+ 
+ package libcontainer
+@@ -204,7 +205,6 @@ type initProcess struct {
+ 	process       *Process
+ 	bootstrapData io.Reader
+ 	sharePidns    bool
+-	rootDir       *os.File
+ }
+ 
+ func (p *initProcess) pid() int {
+@@ -257,7 +257,6 @@ func (p *initProcess) start() error {
+ 	err := p.cmd.Start()
+ 	p.process.ops = p
+ 	p.childPipe.Close()
+-	p.rootDir.Close()
+ 	logs.CloseChild()
+ 	if err != nil {
+ 		p.process.ops = nil
+diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
+index 1f7ec98..e38165d 100644
+--- a/libcontainer/setns_init_linux.go
++++ b/libcontainer/setns_init_linux.go
+@@ -1,3 +1,4 @@
++//go:build linux
+ // +build linux
+ 
+ package libcontainer
+@@ -73,5 +74,23 @@ func (l *linuxSetnsInit) Init() error {
+ 		syscall.Close(l.logFd)
+ 	}
+ 
++	// Close all file descriptors we are not passing to the container. This is
++	// necessary because the execve target could use internal runc fds as the
++	// execve path, potentially giving access to binary files from the host
++	// (which can then be opened by container processes, leading to container
++	// escapes). Note that because this operation will close any open file
++	// descriptors that are referenced by (*os.File) handles from underneath
++	// the Go runtime, we must not do any file operations after this point
++	// (otherwise the (*os.File) finaliser could close the wrong file). See
++	// CVE-2024-21626 for more information as to why this protection is
++	// necessary.
++	//
++	// This is not needed for runc-dmz, because the extra execve(2) step means
++	// that all O_CLOEXEC file descriptors have already been closed and thus
++	// the second execve(2) from runc-dmz cannot access internal file
++	// descriptors from runc.
++	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
++		return err
++	}
+ 	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
+ }
+diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
+index 6236593..7ebf1a2 100644
+--- a/libcontainer/standard_init_linux.go
++++ b/libcontainer/standard_init_linux.go
+@@ -1,3 +1,4 @@
++//go:build linux
+ // +build linux
+ 
+ package libcontainer
+@@ -15,14 +16,17 @@ import (
+ 	"github.com/opencontainers/runc/libcontainer/keys"
+ 	"github.com/opencontainers/runc/libcontainer/seccomp"
+ 	"github.com/opencontainers/runc/libcontainer/system"
++	"github.com/opencontainers/runc/libcontainer/utils"
+ 	"github.com/opencontainers/selinux/go-selinux/label"
++
++	"golang.org/x/sys/unix"
+ )
+ 
+ type linuxStandardInit struct {
+ 	pipe          *os.File
+ 	consoleSocket *os.File
+ 	parentPid     int
+-	stateDirFD    int
++	fifoFd        int
+ 	config        *initConfig
+ 	logFd         int
+ }
+@@ -187,7 +191,7 @@ func (l *linuxStandardInit) Init() error {
+ 	// exec'ing the users process.
+ 	ch := make(chan Error, 1)
+ 	go func() {
+-		fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
++		fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
+ 		if err != nil {
+ 			ch <- newSystemErrorWithCause(err, "openat exec fifo")
+ 			return
+@@ -215,7 +219,25 @@ func (l *linuxStandardInit) Init() error {
+ 	}
+ 	// close the statedir fd before exec because the kernel resets dumpable in the wrong order
+ 	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
+-	syscall.Close(l.stateDirFD)
++	unix.Close(l.fifoFd)
++	// Close all file descriptors we are not passing to the container. This is
++	// necessary because the execve target could use internal runc fds as the
++	// execve path, potentially giving access to binary files from the host
++	// (which can then be opened by container processes, leading to container
++	// escapes). Note that because this operation will close any open file
++	// descriptors that are referenced by (*os.File) handles from underneath
++	// the Go runtime, we must not do any file operations after this point
++	// (otherwise the (*os.File) finaliser could close the wrong file). See
++	// CVE-2024-21626 for more information as to why this protection is
++	// necessary.
++	//
++	// This is not needed for runc-dmz, because the extra execve(2) step means
++	// that all O_CLOEXEC file descriptors have already been closed and thus
++	// the second execve(2) from runc-dmz cannot access internal file
++	// descriptors from runc.
++	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
++		return err
++	}
+ 	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
+ 		return newSystemErrorWithCause(err, "exec user process")
+ 	}
+diff --git a/libcontainer/standard_init_linux.go.orig b/libcontainer/standard_init_linux.go.orig
+deleted file mode 100644
+index 611b91d..0000000
+--- a/libcontainer/standard_init_linux.go.orig
++++ /dev/null
+@@ -1,223 +0,0 @@
+-// +build linux
+-
+-package libcontainer
+-
+-import (
+-	"fmt"
+-	"os"
+-	"os/exec"
+-	"strings"
+-	"syscall"
+-	"time"
+-
+-	"github.com/opencontainers/runc/libcontainer/apparmor"
+-	"github.com/opencontainers/runc/libcontainer/configs"
+-	"github.com/opencontainers/runc/libcontainer/keys"
+-	"github.com/opencontainers/runc/libcontainer/seccomp"
+-	"github.com/opencontainers/runc/libcontainer/system"
+-	"github.com/opencontainers/selinux/go-selinux/label"
+-)
+-
+-type linuxStandardInit struct {
+-	pipe          *os.File
+-	consoleSocket *os.File
+-	parentPid     int
+-	stateDirFD    int
+-	config        *initConfig
+-	logFd         int
+-}
+-
+-func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
+-	var newperms uint32
+-
+-	if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
+-		// with user ns we need 'other' search permissions
+-		newperms = 0x8
+-	} else {
+-		// without user ns we need 'UID' search permissions
+-		newperms = 0x80000
+-	}
+-
+-	// create a unique per session container name that we can
+-	// join in setns; however, other containers can also join it
+-	return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
+-}
+-
+-// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
+-// the kernel
+-const PR_SET_NO_NEW_PRIVS = 0x26
+-
+-func (l *linuxStandardInit) Init() error {
+-	if !l.config.Config.NoNewKeyring {
+-		ringname, keepperms, newperms := l.getSessionRingParams()
+-
+-		// do not inherit the parent's session keyring
+-		sessKeyId, err := keys.JoinSessionKeyring(ringname)
+-		if err != nil {
+-			return err
+-		}
+-		// make session keyring searcheable
+-		if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
+-			return err
+-		}
+-	}
+-
+-	if err := setupNetwork(l.config); err != nil {
+-		return err
+-	}
+-	if err := setupRoute(l.config.Config); err != nil {
+-		return err
+-	}
+-
+-	label.Init()
+-
+-	// prepareRootfs() can be executed only for a new mount namespace.
+-	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
+-		if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
+-			return err
+-		}
+-	}
+-
+-	// Set up the console. This has to be done *before* we finalize the rootfs,
+-	// but *after* we've given the user the chance to set up all of the mounts
+-	// they wanted.
+-	if l.config.CreateConsole {
+-		if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
+-			return err
+-		}
+-		if err := system.Setctty(); err != nil {
+-			return err
+-		}
+-	}
+-
+-	// Finish the rootfs setup.
+-	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
+-		if err := finalizeRootfs(l.config.Config); err != nil {
+-			return err
+-		}
+-	}
+-
+-	if hostname := l.config.Config.Hostname; hostname != "" {
+-		if err := syscall.Sethostname([]byte(hostname)); err != nil {
+-			return err
+-		}
+-	}
+-	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
+-		return err
+-	}
+-	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
+-		return err
+-	}
+-	// when userns enabled, write to sysctl will fail, let docker-hooks do this job
+-	if len(l.config.Config.UidMappings) == 0 && len(l.config.Config.GidMappings) == 0 {
+-		for key, value := range l.config.Config.Sysctl {
+-			if err := writeSystemProperty(key, value); err != nil {
+-				return err
+-			}
+-		}
+-	}
+-	for _, path := range l.config.Config.ReadonlyPaths {
+-		if err := readonlyPath(path); err != nil {
+-			return err
+-		}
+-	}
+-	for _, m := range l.config.Config.Mounts {
+-		if m.Flags&syscall.MS_RDONLY == 0 && m.Device == "proc" && strings.HasPrefix(m.Destination, "/proc/sys/") {
+-			if err := remountReadWrite(m.Destination); err != nil {
+-				return err
+-			}
+-		}
+-	}
+-	for _, path := range l.config.Config.MaskPaths {
+-		if err := maskPath(path); err != nil {
+-			return err
+-		}
+-	}
+-	pdeath, err := system.GetParentDeathSignal()
+-	if err != nil {
+-		return err
+-	}
+-	if l.config.NoNewPrivileges {
+-		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
+-			return err
+-		}
+-	}
+-	// Tell our parent that we're ready to Execv. This must be done before the
+-	// Seccomp rules have been applied, because we need to be able to read and
+-	// write to a socket.
+-	if err := syncParentReady(l.pipe); err != nil {
+-		return err
+-	}
+-	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
+-	// do this before dropping capabilities; otherwise do it as late as possible
+-	// just before execve so as few syscalls take place after it as possible.
+-	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
+-		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
+-			return err
+-		}
+-	}
+-	if err := finalizeNamespace(l.config); err != nil {
+-		return err
+-	}
+-	// finalizeNamespace can change user/group which clears the parent death
+-	// signal, so we restore it here.
+-	if err := pdeath.Restore(); err != nil {
+-		return err
+-	}
+-	// compare the parent from the initial start of the init process and make sure that it did not change.
+-	// if the parent changes that means it died and we were reparented to something else so we should
+-	// just kill ourself and not cause problems for someone else.
+-	if syscall.Getppid() != l.parentPid {
+-		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
+-	}
+-	// check for the arg before waiting to make sure it exists and it is returned
+-	// as a create time error.
+-	name, err := exec.LookPath(l.config.Args[0])
+-	if err != nil {
+-		return err
+-	}
+-	// close the pipe to signal that we have completed our init.
+-	l.pipe.Close()
+-
+-	if l.logFd != 0 {
+-		syscall.Close(l.logFd)
+-	}
+-
+-	// wait for the fifo to be opened on the other side before
+-	// exec'ing the users process.
+-	ch := make(chan Error, 1)
+-	go func() {
+-		fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
+-		if err != nil {
+-			ch <- newSystemErrorWithCause(err, "openat exec fifo")
+-			return
+-		}
+-		if _, err := syscall.Write(fd, []byte("0")); err != nil {
+-			ch <- newSystemErrorWithCause(err, "write 0 exec fifo")
+-			return
+-		}
+-		ch <- nil
+-	}()
+-
+-	select {
+-	case chErr := <-ch:
+-		if chErr != nil {
+-			return chErr
+-		}
+-	case <-time.After(120 * time.Second):
+-		return newSystemErrorWithCause(fmt.Errorf("timeout"), "wait for the fifo to be opened on the other side ")
+-	}
+-
+-	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
+-		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
+-			return newSystemErrorWithCause(err, "init seccomp")
+-		}
+-	}
+-	// close the statedir fd before exec because the kernel resets dumpable in the wrong order
+-	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
+-	syscall.Close(l.stateDirFD)
+-	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
+-		return newSystemErrorWithCause(err, "exec user process")
+-	}
+-	return nil
+-}
+diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
+index cd04ace..922cffb 100644
+--- a/libcontainer/utils/utils.go
++++ b/libcontainer/utils/utils.go
+@@ -5,17 +5,12 @@ import (
+ 	"encoding/binary"
+ 	"encoding/hex"
+ 	"encoding/json"
+-	"fmt"
+ 	"io"
+ 	"os"
+ 	"path/filepath"
+-	"strconv"
+ 	"strings"
+ 	"syscall"
+ 	"unsafe"
+-
+-	securejoin "github.com/cyphar/filepath-securejoin"
+-	"golang.org/x/sys/unix"
+ )
+ 
+ const (
+@@ -175,36 +170,3 @@ func stripRoot(root, path string) string {
+ 	}
+ 	return CleanPath("/" + path)
+ }
+-
+-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
+-// corresponding to the unsafePath resolved within the root. Before passing the
+-// fd, this path is verified to have been inside the root -- so operating on it
+-// through the passed fdpath should be safe. Do not access this path through
+-// the original path strings, and do not attempt to use the pathname outside of
+-// the passed closure (the file handle will be freed once the closure returns).
+-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+-	// Remove the root then forcefully resolve inside the root.
+-	unsafePath = stripRoot(root, unsafePath)
+-	path, err := securejoin.SecureJoin(root, unsafePath)
+-	if err != nil {
+-		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
+-	}
+-
+-	// Open the target path.
+-	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+-	if err != nil {
+-		return fmt.Errorf("open o_path procfd: %w", err)
+-	}
+-	defer fh.Close()
+-
+-	// Double-check the path is the one we expected.
+-	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
+-	if realpath, err := os.Readlink(procfd); err != nil {
+-		return fmt.Errorf("procfd verification failed: %w", err)
+-	} else if realpath != path {
+-		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+-	}
+-
+-	// Run the closure.
+-	return fn(procfd)
+-}
+diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
+index 7b798cc..cfacfc2 100644
+--- a/libcontainer/utils/utils_unix.go
++++ b/libcontainer/utils/utils_unix.go
+@@ -1,43 +1,264 @@
+++///go:build !windows
++//go:build !windows
+ // +build !windows
+ 
+ package utils
+ 
+ import (
+-	"io/ioutil"
++	"fmt"
++	"math"
+ 	"os"
++	"path/filepath"
++	"runtime"
+ 	"strconv"
+-	"syscall"
++	"sync"
++	_ "unsafe" // for go:linkname
++
++	securejoin "github.com/cyphar/filepath-securejoin"
++	"github.com/Sirupsen/logrus"
++	"golang.org/x/sys/unix"
+ )
+ 
+-func CloseExecFrom(minFd int) error {
+-	fdList, err := ioutil.ReadDir("/proc/self/fd")
++// EnsureProcHandle returns an error unless the given file handle is on procfs.
++func EnsureProcHandle(fh *os.File) error {
++	var buf unix.Statfs_t
++	if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
++		return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
++	}
++	if buf.Type != unix.PROC_SUPER_MAGIC {
++		return fmt.Errorf("%s is not on procfs", fh.Name())
++	}
++	return nil
++}
++
++var (
++	haveCloseRangeCloexecBool bool
++	haveCloseRangeCloexecOnce sync.Once
++)
++
++func haveCloseRangeCloexec() bool {
++	haveCloseRangeCloexecOnce.Do(func() {
++		// Probe a throwaway duplicate of fd 0 so no descriptor in use is touched.
++		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
++		if err != nil {
++			return
++		}
++		defer unix.Close(tmpFd)
++
++		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
++		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
++		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
++		// other potential error would imply that even the most basic close
++		// operation wouldn't work.
++		haveCloseRangeCloexecBool = err == nil
++	})
++	return haveCloseRangeCloexecBool
++}
++
++type fdFunc func(fd int)
++
++// fdRangeFrom calls the passed fdFunc for each open file descriptor of the
++// current process that is >= minFd (the fd used for the listing is skipped).
++func fdRangeFrom(minFd int, fn fdFunc) error {
++	procSelfFd, closer := ProcThreadSelf("fd")
++	defer closer()
++
++	fdDir, err := os.Open(procSelfFd)
++	if err != nil {
++		return err
++	}
++	defer fdDir.Close()
++
++	if err := EnsureProcHandle(fdDir); err != nil {
++		return err
++	}
++
++	fdList, err := fdDir.Readdirnames(-1)
+ 	if err != nil {
+ 		return err
+ 	}
+-	for _, fi := range fdList {
+-		fd, err := strconv.Atoi(fi.Name())
++	for _, fdStr := range fdList {
++		fd, err := strconv.Atoi(fdStr)
++		// Ignore non-numeric file names.
+ 		if err != nil {
+-			// ignore non-numeric file names
+ 			continue
+ 		}
+-
++		// Ignore descriptors lower than our specified minimum.
+ 		if fd < minFd {
+-			// ignore descriptors lower than our specified minimum
+ 			continue
+ 		}
+-
+-		// intentionally ignore errors from syscall.CloseOnExec
+-		syscall.CloseOnExec(fd)
+-		// the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall)
++		// Ignore the file descriptor we used for readdir, as it will be closed
++		// when we return.
++		if uintptr(fd) == fdDir.Fd() {
++			continue
++		}
++		// Run the closure.
++		fn(fd)
+ 	}
+ 	return nil
+ }
+ 
+-// NewSockPair returns a new unix socket pair
+-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
+-	fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
++// CloseExecFrom sets the close-on-exec flag on all file descriptors greater
++// than or equal to minFd in the current process.
++func CloseExecFrom(minFd int) error {
++	// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
++	if haveCloseRangeCloexec() {
++		err := unix.CloseRange(uint(minFd), math.MaxUint64, unix.CLOSE_RANGE_CLOEXEC)
++		return os.NewSyscallError("close_range", err)
++	}
++	// Otherwise, fall back to flagging each open descriptor individually.
++	return fdRangeFrom(minFd, unix.CloseOnExec)
++}
++
++//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
++
++// In order to make sure we do not close the internal epoll descriptors the Go
++// runtime uses, we need to ensure that we skip descriptors that match
++// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
++// unfortunately there's no other way to be sure we're only keeping the file
++// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
++func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
++
++// UnsafeCloseFrom closes all file descriptors greater than or equal to minFd
++// in the current process, except for those critical to Go's runtime (such as
++// the netpoll management descriptors).
++//
++// NOTE: This function is incredibly dangerous to use in most Go code, as
++// closing file descriptors from underneath *os.File handles can lead to very
++// bad behaviour (the closed file descriptor can be re-used and then any
++// *os.File operations would apply to the wrong file). This function is only
++// intended to be called from the last stage of runc init.
++func UnsafeCloseFrom(minFd int) error {
++	// We cannot use close_range(2) even if it is available, because we must
++	// not close some file descriptors.
++	return fdRangeFrom(minFd, func(fd int) {
++		if runtime_IsPollDescriptor(uintptr(fd)) {
++			// These are the Go runtime's internal netpoll file descriptors.
++			// These file descriptors are operated on deep in the Go scheduler,
++			// and closing those files from underneath Go can result in panics.
++			// There is no issue with keeping them because they are not
++			// executable and are not useful to an attacker anyway. Also we
++			// don't have any choice.
++			return
++		}
++		// There's nothing we can do about errors from close(2), and the
++		// only likely error to be seen is EBADF which indicates the fd was
++		// already closed (in which case, we got what we wanted).
++		_ = unix.Close(fd)
++	})
++}
++
++// NewSockPair returns a new SOCK_STREAM unix socket pair.
++func NewSockPair(name string) (parent, child *os.File, err error) {
++	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
+ 	if err != nil {
+ 		return nil, nil, err
+ 	}
+ 	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
+ }
++
++// WithProcfd runs the passed closure with a procfd path (as produced by
++// ProcThreadSelf("fd/"), e.g. /proc/thread-self/fd/<n>) corresponding to the
++// unsafePath resolved within the root. The path is verified to be inside the
++// root before the closure runs, so operating on it through the passed procfd
++// should be safe. Do not access it through the original path strings, and do
++// not use it outside of the closure (the file handle is closed on return).
++func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
++	// Remove the root then forcefully resolve inside the root.
++	unsafePath = stripRoot(root, unsafePath)
++	path, err := securejoin.SecureJoin(root, unsafePath)
++	if err != nil {
++		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
++	}
++
++	procSelfFd, closer := ProcThreadSelf("fd/")
++	defer closer()
++
++	// Open the target path.
++	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
++	if err != nil {
++		return fmt.Errorf("open o_path procfd: %w", err)
++	}
++	defer fh.Close()
++
++	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
++	// Double-check the path is the one we expected.
++	if realpath, err := os.Readlink(procfd); err != nil {
++		return fmt.Errorf("procfd verification failed: %w", err)
++	} else if realpath != path {
++		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
++	}
++
++	return fn(procfd)
++}
++
++type ProcThreadSelfCloser func()
++
++var (
++	haveProcThreadSelf     bool
++	haveProcThreadSelfOnce sync.Once
++)
++
++// ProcThreadSelf returns a string that is equivalent to
++// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
++// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
++// meaning that the passed string needs to be trusted. The caller _must_ call
++// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
++// *exactly once* after it has finished using the returned path string.
++func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
++	haveProcThreadSelfOnce.Do(func() {
++		if _, err := os.Stat("/proc/thread-self/"); err == nil {
++			haveProcThreadSelf = true
++		} else {
++			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
++		}
++	})
++
++	// We need to lock our thread until the caller is done with the path string
++	// because any non-atomic operation on the path (such as opening a file,
++	// then reading it) could be interrupted by the Go runtime where the
++	// underlying thread is swapped out and the original thread is killed,
++	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
++	// addition, the pre-3.17 fallback makes everything non-atomic because the
++	// same thing could happen between unix.Gettid() and the path operations.
++	//
++	// In theory, we don't need to lock in the atomic user case when using
++	// /proc/thread-self/, but it's better to be safe than sorry (and there are
++	// only one or two truly atomic users of /proc/thread-self/).
++	runtime.LockOSThread()
++
++	threadSelf := "/proc/thread-self/"
++	if !haveProcThreadSelf {
++		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
++		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
++		if _, err := os.Stat(threadSelf); err != nil {
++			// Unfortunately, this code is called from rootfs_linux.go where we
++			// are running inside the pid namespace of the container but /proc
++			// is the host's procfs. Unfortunately there is no real way to get
++			// the correct tid to use here (the kernel age means we cannot do
++			// things like set up a private fsopen("proc") -- even scanning
++			// NSpid in all of the tasks in /proc/self/task/*/status requires
++			// Linux 4.1).
++			//
++			// So, we just have to assume that /proc/self is acceptable in this
++			// one specific case.
++			if os.Getpid() == 1 {
++				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
++			} else {
++				// This should never happen, but the fallback should work in most cases...
++				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
++			}
++			threadSelf = "/proc/self/"
++		}
++	}
++	return threadSelf + subpath, runtime.UnlockOSThread
++}
++
++// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier
++// to create a /proc/thread-self path for a given file descriptor.
++//
++// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
++// without using fmt.Sprintf to avoid unneeded overhead.
++func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
++	return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
++}
+diff --git a/vendor/golang.org/x/sys/unix/flock.go b/vendor/golang.org/x/sys/unix/flock.go
+index ce67a59..e8d1081 100644
+--- a/vendor/golang.org/x/sys/unix/flock.go
++++ b/vendor/golang.org/x/sys/unix/flock.go
+@@ -14,6 +14,11 @@ import "unsafe"
+ // systems by flock_linux_32bit.go to be SYS_FCNTL64.
+ var fcntl64Syscall uintptr = SYS_FCNTL
+ 
++// FcntlInt performs a fcntl syscall on fd with the provided command and argument.
++func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
++	return fcntl(int(fd), cmd, arg)
++}
++
+ // FcntlFlock performs a fcntl syscall for the F_GETLK, F_SETLK or F_SETLKW command.
+ func FcntlFlock(fd uintptr, cmd int, lk *Flock_t) error {
+ 	_, _, errno := Syscall(fcntl64Syscall, fd, uintptr(cmd), uintptr(unsafe.Pointer(lk)))
+diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
+index f21dcd9..e1bde81 100644
+--- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
++++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
+@@ -934,6 +934,7 @@ const (
+ 	PRIO_PGRP                        = 0x1
+ 	PRIO_PROCESS                     = 0x0
+ 	PRIO_USER                        = 0x2
++	PROC_SUPER_MAGIC                 = 0x9fa0
+ 	PROT_EXEC                        = 0x4
+ 	PROT_GROWSDOWN                   = 0x1000000
+ 	PROT_GROWSUP                     = 0x2000000
+diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
+index 16a18f5..388d1fc 100644
+--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
++++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
+@@ -966,6 +966,7 @@ const (
+ 	PRIO_PGRP                        = 0x1
+ 	PRIO_PROCESS                     = 0x0
+ 	PRIO_USER                        = 0x2
++	PROC_SUPER_MAGIC                 = 0x9fa0
+ 	PROT_EXEC                        = 0x4
+ 	PROT_GROWSDOWN                   = 0x1000000
+ 	PROT_GROWSUP                     = 0x2000000
+diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
+index 8b2e87d..fe21f83 100644
+--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
++++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
+@@ -312,6 +312,16 @@ func Close(fd int) (err error) {
+ 
+ // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+ 
++func CloseRange(first uint, last uint, flags uint) (err error) {
++	_, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags))
++	if e1 != 0 {
++		err = errnoErr(e1)
++	}
++	return
++}
++
++// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
++
+ func Dup(oldfd int) (fd int, err error) {
+ 	r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0)
+ 	fd = int(r0)
+diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
+index f6cc320..395e2de 100644
+--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
++++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
+@@ -312,6 +312,16 @@ func Close(fd int) (err error) {
+ 
+ // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+ 
++func CloseRange(first uint, last uint, flags uint) (err error) {
++	_, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags))
++	if e1 != 0 {
++		err = errnoErr(e1)
++	}
++	return
++}
++
++// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
++
+ func Dup(oldfd int) (fd int, err error) {
+ 	r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0)
+ 	fd = int(r0)
+diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
+index 9042317..f7c427c 100644
+--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
++++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
+@@ -338,4 +338,5 @@ const (
+ 	SYS_PKEY_MPROTECT          = 329
+ 	SYS_PKEY_ALLOC             = 330
+ 	SYS_PKEY_FREE              = 331
++	SYS_CLOSE_RANGE            = 436
+ )
+diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
+index 90e43d0..530563a 100644
+--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
++++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
+@@ -282,4 +282,5 @@ const (
+ 	SYS_PKEY_MPROTECT          = 288
+ 	SYS_PKEY_ALLOC             = 289
+ 	SYS_PKEY_FREE              = 290
++	SYS_CLOSE_RANGE            = 436
+ )
+diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
+index c9e1e64..2f12811 100644
+--- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
++++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
+@@ -345,6 +345,11 @@ type TCPInfo struct {
+ 	Total_retrans  uint32
+ }
+ 
++const (
++	CLOSE_RANGE_UNSHARE = 0x2
++	CLOSE_RANGE_CLOEXEC = 0x4
++)
++
+ const (
+ 	SizeofSockaddrInet4     = 0x10
+ 	SizeofSockaddrInet6     = 0x1c
+diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
+index e58c500..b77eceb 100644
+--- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
++++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
+@@ -30,6 +30,11 @@ type Timeval struct {
+ 	Usec int64
+ }
+ 
++const (
++	CLOSE_RANGE_UNSHARE = 0x2
++	CLOSE_RANGE_CLOEXEC = 0x4
++)
++
+ type Timex struct {
+ 	Modes     uint32
+ 	Pad_cgo_0 [4]byte
+-- 
+2.33.0
+
diff --git a/runc.spec b/runc.spec
index d553193..68b20d9 100644
--- a/runc.spec
+++ b/runc.spec
@@ -2,7 +2,7 @@
 
 Name: docker-runc
 Version: 1.0.0.rc3
-Release: 222
+Release: 223
 Summary: runc is a CLI tool for spawning and running containers according to the OCI specification.
 
 License: ASL 2.0
@@ -41,6 +41,12 @@ install -p -m 755 runc $RPM_BUILD_ROOT/%{_bindir}/runc
 %{_bindir}/runc
 
 %changelog
+* Thu Feb 1 2024 zhongjiawei<zhongjiawei1@huawei.com> - 1.0.0.rc3-223
+- Type:CVE
+- CVE:CVE-2024-21626
+- SUG:NA
+- DESC:fix RootDir fd leaks
+
 * Fri Dec 8 2023 zhongjiawei<zhongjiawei1@huawei.com> - 1.0.0.rc3-222
 - Type:bugfix
 - CVE:NA
diff --git a/series.conf b/series.conf
index 07b2e76..f9e40a3 100644
--- a/series.conf
+++ b/series.conf
@@ -139,3 +139,4 @@
 0145-runc-libcontainer-create-Cwd-when-it-does-not-exist.patch
 0146-runc-delete-do-not-ignore-error-from-destroy.patch
 0147-runc-libct-Destroy-don-t-proceed-in-case-of-errors.patch
+0148-runc-fix-CVE-2024-21626.patch
-- 
Gitee