From df4eae457b8ccffa619c659c2def5c777d8ff507 Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Mon, 26 Dec 2022 12:04:26 +0900 Subject: [PATCH] rootless: fix /sys/fs/cgroup mounts It was found that rootless runc makes `/sys/fs/cgroup` writable in following conditons: 1. when runc is executed inside the user namespace, and the config.json does not specify the cgroup namespace to be unshared (e.g.., `(docker|podman|nerdctl) run --cgroupns=host`, with Rootless Docker/Podman/nerdctl) 2. or, when runc is executed outside the user namespace, and `/sys` is mounted with `rbind, ro` (e.g., `runc spec --rootless`; this condition is very rare) A container may gain the write access to user-owned cgroup hierarchy `/sys/fs/cgroup/user.slice/...` on the host. Other users's cgroup hierarchies are not affected. To fix the issue, this commit does: 1. Remount `/sys/fs/cgroup` to apply `MS_RDONLY` when it is being bind-mounted 2. Mask `/sys/fs/cgroup` when the bind source is unavailable Fix CVE-2023-25809 (GHSA-m8cg-xc2p-r3fc) Co-authored-by: Kir Kolyshkin Signed-off-by: Akihiro Suda --- libcontainer/rootfs_linux.go | 53 ++++++++++++++++++++++------------- tests/integration/mounts.bats | 17 +++++++++++ 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 2a98372b561..2e0e3770c2f 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -306,26 +306,41 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { if err := os.MkdirAll(dest, 0o755); err != nil { return err } - return utils.WithProcfd(c.root, m.Destination, func(procfd string) error { - if err := mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil { - // when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158) - if errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY) { - src := fs2.UnifiedMountpoint - if c.cgroupns && c.cgroup2Path != "" { - // Emulate cgroupns by bind-mounting - // the container cgroup path rather than - // the whole /sys/fs/cgroup. - src = c.cgroup2Path - } - err = mount(src, m.Destination, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "") - if c.rootlessCgroups && errors.Is(err, unix.ENOENT) { - err = nil - } - } - return err - } - return nil + err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error { + return mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data) }) + if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) { + return err + } + + // When we are in UserNS but CgroupNS is not unshared, we cannot mount + // cgroup2 (#2158), so fall back to bind mount. + bindM := &configs.Mount{ + Device: "bind", + Source: fs2.UnifiedMountpoint, + Destination: m.Destination, + Flags: unix.MS_BIND | m.Flags, + PropagationFlags: m.PropagationFlags, + } + if c.cgroupns && c.cgroup2Path != "" { + // Emulate cgroupns by bind-mounting the container cgroup path + // rather than the whole /sys/fs/cgroup. + bindM.Source = c.cgroup2Path + } + // mountToRootfs() handles remounting for MS_RDONLY. + // No need to set c.fd here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate(). + err = mountToRootfs(bindM, c) + if c.rootlessCgroups && errors.Is(err, unix.ENOENT) { + // ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed + // outside the userns+mountns. + // + // Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted + // with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`). + err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error { + return maskPath(procfd, c.label) + }) + } + return err } func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) { diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats index 1ec675aca55..1e72c5b142f 100644 --- a/tests/integration/mounts.bats +++ b/tests/integration/mounts.bats @@ -63,3 +63,20 @@ function teardown() { runc run test_busybox [ "$status" -eq 0 ] } + +# /~https://github.com/opencontainers/runc/security/advisories/GHSA-m8cg-xc2p-r3fc +@test "runc run [ro /sys/fs/cgroup mount]" { + # With cgroup namespace + update_config '.process.args |= ["sh", "-euc", "for f in `grep /sys/fs/cgroup /proc/mounts | awk \"{print \\\\$2}\"| uniq`; do grep -w $f /proc/mounts | tail -n1; done"]' + runc run test_busybox + [ "$status" -eq 0 ] + [ "${#lines[@]}" -ne 0 ] + for line in "${lines[@]}"; do [[ "${line}" == *'ro,'* ]]; done + + # Without cgroup namespace + update_config '.linux.namespaces -= [{"type": "cgroup"}]' + runc run test_busybox + [ "$status" -eq 0 ] + [ "${#lines[@]}" -ne 0 ] + for line in "${lines[@]}"; do [[ "${line}" == *'ro,'* ]]; done +}