From 0b59f67c3ae37c43f065c6deb4fe056668f02ed6 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Fri, 2 Aug 2024 11:11:07 +0200 Subject: [PATCH 1/2] add epbf program to trace podman cleanup errors Add a new program based on bpftrace[1] to trace all podman processes with arguments and exit code/signals. Additionally this captures stderr from all podman container cleanup processes spawned by conmon which otherwise go to /dev/null and are never seen in any CI logs. Hopefull this allows us to debug strange network cleanup error seen in CI, my plan is to add this to the cirrus setup and upload the logs so we can check them when the flakes happen. [1] /~https://github.com/bpftrace/bpftrace Signed-off-by: Paul Holzinger --- hack/podman_cleanup_tracer.bt | 151 ++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100755 hack/podman_cleanup_tracer.bt diff --git a/hack/podman_cleanup_tracer.bt b/hack/podman_cleanup_tracer.bt new file mode 100755 index 0000000000..42c46cde9a --- /dev/null +++ b/hack/podman_cleanup_tracer.bt @@ -0,0 +1,151 @@ +#!/usr/bin/env bpftrace + +// Traces podman and conmon commands for there exit status and all process that +// send signals to them. Also prints stderr for all podman container cleanup processes. +// The goal is to be able to trace all podman cleanup errors, while there is an +// option to log the errors to the syslog that only works of the process if the +// process got there and not if it was killed before. This program here will trace +// the exit status as well so we know even if it was killed by a signal. +// +// This is script uses /~https://github.com/bpftrace/bpftrace to compile this into +// kernel ebpf. +// +// Usage: sudo ./podman_cleanup_tracer.bt +// see below for output format + +BEGIN { + print("Output format is one of:"); + print("exec TIME PID PPID COMM ARGV(truncated)"); + print("cmd TIME PID PPID ARGV(full)"); + print("exit TIME PID PPID COMM EXIT_CODE EXIT_SIGNAL"); + print("kill TIME PID PPID COMM SIGNAL TPID RET"); + print("stderr TIME PID PPID COMM OUTPUT"); +} + +// Trace all exec calls to find all podman + conmon processes, it will also catch +// netavark and aardvark-dns as they have podman in the path as well but this is +// good so we can see any errors there as well. +tracepoint:syscalls:sys_enter_exec* +/ strcontains(str(args.argv[0]),"podman") || strcontains(str(args.argv[0]), "conmon") / +{ + + // create entry in pid map so we can check the pid later + @pids[pid] = 1; + + // Find the podman cleanup process spawned by conmon. + // I tried matching argv but there seems to be no way to iterate over it. + // In practise parent name conmon and argv0 podman should contain all the + // cleanup processes we care about. + if (comm == "conmon" && strcontains(str(args.argv[0]), "podman")) { + @cleanupPids[pid] = 1; + } + + printf("%-6s %s %-8d %-8d %-12s ", + "exec", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid, + comm + ); + + // Unfortunately this doesn't print the full argv0 as there is some limit, + // as such we use the /proc/<>/cmdline reading hack below on exec exit. + join(args->argv); +} + + +// Print the full cmdline +tracepoint:syscalls:sys_exit_exec* +/ @pids[pid] / +{ + printf("%-6s %s %-8d %-8d ", + "cmd", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid + ); + // This can fail to open the file it is done in user space and + // thus racy if the process exits quickly. + cat("/proc/%d/cmdline", pid); + print(""); +} + + +// Trace all exits for the pids we matched above, +// pid == tid is used to only match the main exit +// and not all thread exits which we do not care about. +tracepoint:sched:sched_process_exit +/ @pids[pid] && pid == tid / +{ + printf("%-6s %s %-8d %-8d %-12s %d %d\n", + "exit", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid, + comm, + curtask->exit_code >> 8, // actual exit code + curtask->exit_code & 0xFF // signal number if killed + ); + + // process is done remove pid from map + delete(@pids[pid]); + delete(@cleanupPids[pid]); +} + +// Trace all kill calls that target our pids. +// This will not catch signals send via pidfd_send_signal because +// I don't see a way to translate the pidfd to our target pids and +// I don't want to log all send signals on the system. +tracepoint:syscalls:sys_enter_kill +/ @pids[args.pid] / +{ + @tpid[tid] = args.pid; + @tsig[tid] = args.sig; +} + +tracepoint:syscalls:sys_exit_kill +/ @tpid[tid] / +{ + printf("%-6s %s %-8d %-8d %-12s %d %-8d %d\n", + "kill", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid, + comm, + @tsig[tid], + @tpid[tid], + args.ret + ); + delete(@tpid[tid]); + delete(@tsig[tid]); +} + + +// Print anything written on stderr for the podman container cleanup process. +tracepoint:syscalls:sys_enter_write +/ @cleanupPids[pid] && args.fd == 2 / +{ + printf("%-6s %s %-8d %-8d %-12s", + "stderr", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid, + comm + ); + + // String size limit is is 64 by default, this includes the 0 byte and when we + // hit the string limit it also adds "..." when using 63 so we use 63 as len here. + // While upstream fixed these low string limits (/~https://github.com/bpftrace/bpftrace/issues/305) + // it is not yet in older distro version we use so we cannot use that yet. + // Thus manually print several times. + $len = 62; + $offset = 62; + printf("%s", str(args.buf, $len)); + + unroll(10) { + if ((int64)args.count > $offset ) { + printf("%s", str(args.buf + $offset, $len)); + } + $offset += $len + } +} From 09ba5db035dbe9256411daa16839772c10cdaf7f Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Fri, 2 Aug 2024 17:55:01 +0200 Subject: [PATCH 2/2] CI: run and collect cleanup tracer logs In order to get better debug data for cleanup flakes. The argv is printed with 0 bytes so replace them with spaces to make the log readable for humans. Signed-off-by: Paul Holzinger --- .cirrus.yml | 3 +++ .gitignore | 1 + contrib/cirrus/runner.sh | 14 ++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 23c0be84a5..cbdaf422eb 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -401,6 +401,9 @@ bindings_task: server_log_artifacts: path: ./podman-server.log type: text/plain + cleanup_tracer_artifacts: + path: ./podman-cleanup-tracer.log + type: text/plain df_script: '$SCRIPT_BASE/logcollector.sh df' audit_log_script: '$SCRIPT_BASE/logcollector.sh audit' journal_script: '$SCRIPT_BASE/logcollector.sh journal' diff --git a/.gitignore b/.gitignore index d1813cc8fb..1b596b2b44 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,4 @@ contrib/win-installer/shasums contrib/win-installer/*.wixobj contrib/win-installer/*.wixpdb contrib/win-installer/*.log +podman-cleanup-tracer.log* diff --git a/contrib/cirrus/runner.sh b/contrib/cirrus/runner.sh index ea6235eba4..8d4dddd69d 100755 --- a/contrib/cirrus/runner.sh +++ b/contrib/cirrus/runner.sh @@ -458,6 +458,15 @@ msg "************************************************************" ((${SETUP_ENVIRONMENT:-0})) || \ die "Expecting setup_environment.sh to have completed successfully" +if [[ "$UID" -eq 0 ]] && ((CONTAINER==0)); then + # start ebpf cleanup tracer (#23487) + msg "start ebpf cleanup tracer" + # replace zero bytes to make the log more readable + bpftrace $GOSRC/hack/podman_cleanup_tracer.bt |& \ + tr '\0' ' ' >$GOSRC/podman-cleanup-tracer.log & + TRACER_PID=$! +fi + # shellcheck disable=SC2154 if [[ "$PRIV_NAME" == "rootless" ]] && [[ "$UID" -eq 0 ]]; then # Remove /var/lib/cni, it is not required for rootless cni. @@ -499,4 +508,9 @@ fi showrun $handler +if [[ -n "$TRACER_PID" ]]; then + # ignore any error here + kill "$TRACER_PID" || true +fi + showrun echo "finished"