From 9208b1d7b5ea95fdd007175c8f8666fe6252fde0 Mon Sep 17 00:00:00 2001 From: lifubang Date: Thu, 4 Apr 2024 00:23:47 +0800 Subject: [PATCH] use PR_SET_CHILD_SUBREAPER and fork to replace clone(CLONE_PARENT) in nsexec As the description in #4233, there is a bug in glibc, pthread_self() will return wrong info after we do `clone(CLONE_PARENT)` in libct/nsenter, it will cause runc can't work in `go 1.22.*`. So we use fork(2) to replace clone(2) in libct/nsenter, but there is a double-fork in nsenter, so we need to use `PR_SET_CHILD_SUBREAPER` to let runc can reap grand child process in libct/nsenter. Signed-off-by: lifubang --- libcontainer/container_linux.go | 7 +++++ libcontainer/nsenter/nsexec.c | 52 +++++++++++---------------------- 2 files changed, 24 insertions(+), 35 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 13be71ccb89..59f72c62e79 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -341,6 +341,13 @@ func (c *Container) start(process *Process) (retErr error) { if err := utils.CloseExecFrom(3); err != nil { return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) } + + // Tell the kernel that runc wants to reap orphaned children of the + // `runc init` process. + if err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); err != nil { + return fmt.Errorf("unable to set child subreaper: %w", err) + } + if err := parent.start(); err != nil { return fmt.Errorf("unable to start container process: %w", err) } diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index c771ac7e116..63b1b37a7c1 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -52,15 +52,7 @@ enum sync_t { /* Stores the current stage of nsexec. */ int current_stage = STAGE_SETUP; -/* Assume the stack grows down, so arguments should be above it. */ -struct clone_t { - /* - * Reserve some space for clone() to locate arguments - * and retcode in this place - */ - char stack[4096] __attribute__((aligned(16))); - char stack_ptr[0]; - +struct fork_t { /* There's two children. This is used to execute the different code. */ jmp_buf *env; int jmpval; @@ -307,19 +299,24 @@ static void update_oom_score_adj(char *data, size_t len) static int child_func(void *arg) __attribute__((noinline)); static int child_func(void *arg) { - struct clone_t *ca = (struct clone_t *)arg; + struct fork_t *ca = (struct fork_t *)arg; longjmp(*ca->env, ca->jmpval); } -static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); -static int clone_parent(jmp_buf *env, int jmpval) +static int fork_and_run(jmp_buf *env, int jmpval) __attribute__((noinline)); +static int fork_and_run(jmp_buf *env, int jmpval) { - struct clone_t ca = { + struct fork_t ca = { .env = env, .jmpval = jmpval, }; - return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); + pid_t pid = fork(); + if (pid < 0) + bail("failed to fork"); + if (pid == 0) + return child_func(&ca); + return pid; } /* Returns the clone(2) flag for a namespace, given the name of a namespace. */ @@ -644,7 +641,7 @@ void nsexec(void) * * Unfortunately, it's not as simple as that. We have to fork to enter the * PID namespace (the PID namespace only applies to children). Since we'll - * have to double-fork, this clone_parent() call won't be able to get the + * have to double-fork, this fork() call won't be able to get the * PID of the _actual_ init process (without doing more synchronisation than * I can deal with at the moment). So we'll just get the parent to send it * for us, the only job of this process is to update @@ -655,15 +652,6 @@ void nsexec(void) * will be in that namespace (and it will not be able to give us a PID value * that makes sense without resorting to sending things with cmsg). * - * This also deals with an older issue caused by dumping cloneflags into - * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so - * we have to unshare(2) before clone(2) in order to do this. This was fixed - * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was - * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're - * aware, the last mainline kernel which had this bug was Linux 3.12. - * However, we cannot comment on which kernels the broken patch was - * backported to. - * * -- Aleksa "what has my life come to?" Sarai */ @@ -687,7 +675,7 @@ void nsexec(void) /* Start the process of getting a container. */ write_log(DEBUG, "spawn stage-1"); - stage1_pid = clone_parent(&env, STAGE_CHILD); + stage1_pid = fork_and_run(&env, STAGE_CHILD); if (stage1_pid < 0) bail("unable to spawn stage-1"); @@ -755,8 +743,7 @@ void nsexec(void) /* * Send both the stage-1 and stage-2 pids back to runc. * runc needs the stage-2 to continue process management, - * but because stage-1 was spawned with CLONE_PARENT we - * cannot reap it within stage-0 and thus we need to ask + * and ask * runc to reap the zombie for us. */ write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", @@ -892,9 +879,8 @@ void nsexec(void) } /* - * We don't have the privileges to do any mapping here (see the - * clone_parent rant). So signal stage-0 to do the mapping for - * us. + * We don't have the privileges to do any mapping here. + * So signal stage-0 to do the mapping for us. */ write_log(DEBUG, "request stage-0 to map user namespace"); s = SYNC_USERMAP_PLS; @@ -925,10 +911,6 @@ void nsexec(void) * ordering might break in the future (especially with rootless * containers). But for now, it's not possible to split this into * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. - * - * Note that we don't merge this with clone() because there were - * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) - * was broken, so we'll just do it the long way anyway. */ try_unshare(config.cloneflags, "remaining namespaces"); @@ -955,7 +937,7 @@ void nsexec(void) * to actually enter the new PID namespace. */ write_log(DEBUG, "spawn stage-2"); - stage2_pid = clone_parent(&env, STAGE_INIT); + stage2_pid = fork_and_run(&env, STAGE_INIT); if (stage2_pid < 0) bail("unable to spawn stage-2");