Skip to content

Commit

Permalink
Improve diagnostics for kernel supervisor startup failures (#5705)
Browse files Browse the repository at this point in the history
This change improves diagnostics and logging for (hopefully rare) cases
in which the kernel supervisor _itself_ cannot start. This is distinct
from the cases where R or Python can't start. So far we've seen just one
of these in the wild, as a result of running on an unsupported OS.

The approach is to borrow the wrapper script technique from the Jupyter
Adapter (formerly used to invoke the kernels themselves). The wrapper
script acts as a sort of supervisor for the supervisor; it eats the
output of the supervisor process and writes it to a file. If the
supervisor exits unexpectedly at startup, the output file is written to
the log channel, and the user is directed there to view errors.

As an additional benefit, this runs the supervisor under a `bash`
process on Unix-alikes, so any environment variables or configuration
set up in `.bashrc` (etc) will now be available to the supervisor.

Addresses #5611 . 

May help us figure out #5337.

### QA Notes

An easy way to test this is to replace your `kcserver` binary with a
shell script that emits some nonsense and then exits immediately with a
nonzero status code. If you're feeling ambitious, you could also test
this on the OS named in #5611.

Also, did you know that the [longest worm in the
world](https://en.wikipedia.org/wiki/Lineus_longissimus) can reach up to
55 meters? Crazy.
  • Loading branch information
jmcphers authored Dec 12, 2024
1 parent fab84c8 commit 6f5ea7e
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 1 deletion.
8 changes: 8 additions & 0 deletions extensions/positron-supervisor/positron.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"binaries": [
{
"from": "resources/supervisor-wrapper.*",
"to": "resources"
}
]
}
44 changes: 44 additions & 0 deletions extensions/positron-supervisor/resources/supervisor-wrapper.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
@echo off

REM ---------------------------------------------------------------------------------------------
REM Copyright (C) 2024 Posit Software, PBC. All rights reserved.
REM ---------------------------------------------------------------------------------------------

REM This script is used to run a program and capture its output to a file. It is
REM used to capture the output of the supervisor process so that it can be displayed
REM in the UI in the case of a startup failure.

REM Check that the user provided at least two arguments; the first is the output
REM file and the second is the program to run and any arguments. If not, print a
REM usage message and exit with an error code.

if "%~2"=="" (
echo Usage: %0 ^<output-file^> ^<program^> [program-args...] >&2
exit /b 1
)

REM The first argument is the output file; consume it.
set output_file=%1
shift

REM `shift` doesn't affect `%*`, so we have to manually remove the first argument
set "args="
:parse
if "%~1" neq "" (
set args=%args% %1
shift
goto :parse
)
if defined args set args=%args:~1%

REM Print the command line to the log file
echo %args% >> "%output_file%"

REM Run the program with its arguments and capture the output
%args% >> "%output_file%"

REM Save the exit code of the program
set exit_code=%ERRORLEVEL%

REM Exit with the same code as the program so that the caller can correctly report errors
exit /b exit_code
34 changes: 34 additions & 0 deletions extensions/positron-supervisor/resources/supervisor-wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

# ---------------------------------------------------------------------------------------------
# Copyright (C) 2024 Posit Software, PBC. All rights reserved.
# Licensed under the Elastic License 2.0. See LICENSE.txt for license information.
# ---------------------------------------------------------------------------------------------

# This script is used to run a program and capture its output to a file. It is
# used to capture the output of the supervisor process so that it can be displayed
# in the UI in the case of a startup failure

# Check that the user provided at least two arguments; the first is the output
# file and the second is the program to run and any arguments. If not, print a
# usage message and exit with an error code.
if [ $# -lt 2 ]; then
echo "Usage: $0 <output-file> <program> [program-args...]" >&2
exit 1
fi

# The first argument is the output file; consume it.
output_file="$1"
shift

# Print the command line to the log file
echo "$@" >> "$output_file"

# Run the program with its arguments, redirecting stdout and stderr to the output file
"$@" >> "$output_file" 2>&1

# Save the exit code of the program
exit_code=$?

# Exit with the same code as the program so that the caller can correctly report errors
exit $exit_code
81 changes: 80 additions & 1 deletion extensions/positron-supervisor/src/KallichoreAdapterApi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ export class KCApi implements KallichoreAdapterApi {
*/
private _logStreamer: LogStreamer | undefined;

/**
* An array of disposables that need to be cleaned up when the API is disposed.
*/
private _disposables: vscode.Disposable[] = [];

/**
* Create a new Kallichore API object.
*
Expand Down Expand Up @@ -202,12 +207,21 @@ export class KCApi implements KallichoreAdapterApi {
// Create a temporary file with a random name to use for logs
const logFile = path.join(os.tmpdir(), `kallichore-${sessionId}.log`);

// Create a second file to capture the server's stdout and stderr
const outFile = path.join(os.tmpdir(), `kallichore-${sessionId}.out.log`);

// Determine the path to the wrapper script.
const wrapperName = os.platform() === 'win32' ? 'supervisor-wrapper.bat' : 'supervisor-wrapper.sh';
const wrapperPath = path.join(this._context.extensionPath, 'resources', wrapperName);

// Start the server in a new terminal
this._log.appendLine(`Starting Kallichore server ${shellPath} on port ${port}`);
const terminal = vscode.window.createTerminal(<vscode.TerminalOptions>{
name: 'Kallichore',
shellPath: shellPath,
shellPath: wrapperPath,
shellArgs: [
outFile,
shellPath,
'--port', port.toString(),
'--token', tokenPath,
'--log-level', logLevel,
Expand All @@ -219,6 +233,60 @@ export class KCApi implements KallichoreAdapterApi {
isTransient: false
});

// Flag to track if the terminal exited before the start barrier opened
let exited = false;

// Listen for the terminal to close. If it closes unexpectedly before
// the start barrier opens, provide some feedback.
const closeListener = vscode.window.onDidCloseTerminal(async (closedTerminal) => {
// Ignore closed terminals that aren't the one we started
if (closedTerminal !== terminal) {
return;
}

// Ignore if the start barrier is already open (that means the
// server started successfully), or if more than 5 minutes have elapsed
if (this._started.isOpen()) {
return;
}

// Ignore if it's been more than 5 minutes since the start time
if (Date.now() - startTime > 300000) {
return;
}

// Mark the terminal as exited
exited = true;

// Read the contents of the output file and log it
const contents = fs.readFileSync(outFile, 'utf8');
if (terminal.exitStatus && terminal.exitStatus.code) {
this._log.appendLine(`Supervisor terminal closed with exit code ${terminal.exitStatus.code}; output:\n${contents}`);
} else {
this._log.appendLine(`Supervisor terminal closed unexpectedly; output:\n${contents}`);
}

// Display a notification that directs users to open the log to get more information
const selection = await vscode.window.showInformationMessage(
vscode.l10n.t('There was an error starting the kernel supervisor. Check the log for more information.'), {
title: vscode.l10n.t('Open Log'),
execute: () => {
this._log.show(false);
}
});
if (selection) {
selection.execute();
}
});

// Ensure this listener is disposed when the API is disposed
this._disposables.push(closeListener);

// Ensure the output file is cleaned up when the API is disposed
this._disposables.push(new vscode.Disposable(() => {
fs.unlinkSync(outFile);
}));

// Wait for the terminal to start and get the PID
await terminal.processId;

Expand All @@ -237,6 +305,11 @@ export class KCApi implements KallichoreAdapterApi {
} catch (err) {
const elapsed = Date.now() - startTime;

// Has the terminal exited? if it has, there's no point in continuing to retry.
if (exited) {
throw new Error(`The supervisor process exited before the server was ready.`);
}

// ECONNREFUSED is a normal condition during startup; the server
// isn't ready yet. Keep trying until we hit the retry limit,
// about 2 seconds from the time we got a process ID
Expand Down Expand Up @@ -285,6 +358,9 @@ export class KCApi implements KallichoreAdapterApi {
this._log.appendLine(`Streaming Kallichore server logs from ${logFile} (log level: ${logLevel})`);
});

// Now that we're online, we can dispose of the close listener
closeListener.dispose();

// Open the started barrier and save the server state since we're online
this._started.open();
const state: KallichoreServerState = {
Expand Down Expand Up @@ -579,6 +655,9 @@ export class KCApi implements KallichoreAdapterApi {
this._logStreamer.dispose();
this._logStreamer = undefined;
}

// Dispose of any other disposables
this._disposables.forEach(disposable => disposable.dispose());
}

findAvailablePort(excluding: Array<number>, maxTries: number): Promise<number> {
Expand Down

0 comments on commit 6f5ea7e

Please sign in to comment.