Skip to content

Commit

Permalink
libpod: remove UpdateContainerStatus()
Browse files Browse the repository at this point in the history
There are two major problems with UpdateContainerStatus()
First, it can deadlock when the the state json is to big as it tries to
read stderr until EOF but it will never hit EOF as long as the runtime
process is alive. This means if the runtime json is to big to git into
the pipe buffer we deadlock ourselves.
Second, the function modifies the container state struct and even adds
and exit code to the db however when it is called from the stop() code
path we will be unlocked here.

While the first problem is easy to fix the second one not so much. And
when we cannot update the state there is no point in reading the from
runtime in the first place as such remove the function as it does more
harm then good.

And add some warnings the the functions that might be called unlocked.

Fixes #22246

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
  • Loading branch information
Luap99 committed Aug 16, 2024
1 parent 8c132cc commit ddece75
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 100 deletions.
2 changes: 0 additions & 2 deletions libpod/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ type OCIRuntime interface { //nolint:interfacebloat
// the given container if it is a restore and if restoreOptions.PrintStats
// is true. In all other cases the returned int64 is 0.
CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error)
// UpdateContainerStatus updates the status of the given container.
UpdateContainerStatus(ctr *Container) error
// StartContainer starts the given container.
StartContainer(ctr *Container) error
// KillContainer sends the given signal to the given container.
Expand Down
105 changes: 12 additions & 93 deletions libpod/oci_conmon_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,91 +192,6 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta
return r.createOCIContainer(ctr, restoreOptions)
}

// UpdateContainerStatus retrieves the current status of the container from the
// runtime. It updates the container's state but does not save it.
// If useRuntime is false, we will not directly hit runc to see the container's
// status, but will instead only check for the existence of the conmon exit file
// and update state to stopped if it exists.
func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
runtimeDir, err := util.GetRootlessRuntimeDir()
if err != nil {
return err
}

// Store old state so we know if we were already stopped
oldState := ctr.state.State

state := new(spec.State)

cmd := exec.Command(r.path, "state", ctr.ID())
cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir))

outPipe, err := cmd.StdoutPipe()
if err != nil {
return fmt.Errorf("getting stdout pipe: %w", err)
}
errPipe, err := cmd.StderrPipe()
if err != nil {
return fmt.Errorf("getting stderr pipe: %w", err)
}

err = cmd.Start()
if err != nil {
return fmt.Errorf("error launching container runtime: %w", err)
}
defer func() {
_ = cmd.Wait()
}()

stderr, err := io.ReadAll(errPipe)
if err != nil {
return fmt.Errorf("reading stderr: %s: %w", ctr.ID(), err)
}
if strings.Contains(string(stderr), "does not exist") || strings.Contains(string(stderr), "No such file") {
if err := ctr.removeConmonFiles(); err != nil {
logrus.Debugf("unable to remove conmon files for container %s", ctr.ID())
}
ctr.state.ExitCode = -1
ctr.state.FinishedTime = time.Now()
ctr.state.State = define.ContainerStateExited
return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode)
}
if err := errPipe.Close(); err != nil {
return err
}

out, err := io.ReadAll(outPipe)
if err != nil {
return fmt.Errorf("reading stdout: %s: %w", ctr.ID(), err)
}
if err := json.NewDecoder(bytes.NewReader(out)).Decode(state); err != nil {
return fmt.Errorf("decoding container status for container %s: %w", ctr.ID(), err)
}
ctr.state.PID = state.Pid

switch state.Status {
case "created":
ctr.state.State = define.ContainerStateCreated
case "paused":
ctr.state.State = define.ContainerStatePaused
case "running":
ctr.state.State = define.ContainerStateRunning
case "stopped":
ctr.state.State = define.ContainerStateStopped
default:
return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w",
ctr.ID(), state.Status, define.ErrInternal)
}

// Handle ContainerStateStopping - keep it unless the container
// transitioned to no longer running.
if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) {
ctr.state.State = define.ContainerStateStopping
}

return nil
}

// StartContainer starts the given container.
// Sets time the container was started, but does not save it.
func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error {
Expand Down Expand Up @@ -358,6 +273,8 @@ func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool)

// If captureStderr is requested, OCI runtime STDERR will be captured as a
// *bytes.buffer and returned; otherwise, it is set to os.Stderr.
// IMPORTANT: Thus function is called from an unlocked container state in
// the stop() code path so do not modify the state here.
func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captureStderr bool) (*bytes.Buffer, error) {
logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID())
runtimeDir, err := util.GetRootlessRuntimeDir()
Expand All @@ -381,15 +298,15 @@ func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captu
stderr = stderrBuffer
}
if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, stderr, env, r.path, args...); err != nil {
// Update container state - there's a chance we failed because
// the container exited in the meantime.
if err2 := r.UpdateContainerStatus(ctr); err2 != nil {
logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2)
}
if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
return stderrBuffer, fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State)
rErr := err
// quick check if ctr pid is still alive
if err := unix.Kill(ctr.state.PID, 0); err == unix.ESRCH {
// pid already dead so signal sending fails logically, set error to ErrCtrStateInvalid
// This is needed for the ProxySignals() function which already ignores the error to not cause flakes
// when the ctr process just exited.
rErr = define.ErrCtrStateInvalid
}
return stderrBuffer, fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err)
return stderrBuffer, fmt.Errorf("sending signal to container %s: %w", ctr.ID(), rErr)
}

return stderrBuffer, nil
Expand All @@ -401,6 +318,8 @@ func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captu
// immediately kill with SIGKILL.
// Does not set finished time for container, assumes you will run updateStatus
// after to pull the exit code.
// IMPORTANT: Thus function is called from an unlocked container state in
// the stop() code path so do not modify the state here.
func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error {
logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID)

Expand Down
5 changes: 0 additions & 5 deletions libpod/oci_missing.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ func (r *MissingRuntime) CreateContainer(ctr *Container, restoreOptions *Contain
return 0, r.printError()
}

// UpdateContainerStatus is not available as the runtime is missing
func (r *MissingRuntime) UpdateContainerStatus(ctr *Container) error {
return r.printError()
}

// StartContainer is not available as the runtime is missing
func (r *MissingRuntime) StartContainer(ctr *Container) error {
return r.printError()
Expand Down

0 comments on commit ddece75

Please sign in to comment.