
fix: don't ignore failed exps #9693

Merged 9 commits on Aug 13, 2024

Changes from 7 commits
6 changes: 6 additions & 0 deletions docs/release-notes/experiments-failed-status.rst
@@ -0,0 +1,6 @@
:orphan:

**Bug Fixes**

- Experiments: Report an experiment's status as FAILED if a failure occurs while the
  experiment is shutting down but has not yet gracefully completed.
31 changes: 31 additions & 0 deletions e2e_tests/tests/cluster/test_experiment_delete.py
@@ -1,9 +1,11 @@
import pathlib
import subprocess
import time

import pytest

from determined.common import api
from determined.common.api import bindings
from tests import api_utils
from tests import config as conf
from tests import detproc
@@ -47,3 +49,32 @@ def test_delete_experiment_removes_tensorboard_files() -> None:
return

pytest.fail(f"experiment failed to be deleted after {ticks} seconds")


@pytest.mark.e2e_k8s
@pytest.mark.e2e_single_k8s
def test_cancel_experiment_remove_k8s_pod() -> None:
# Start a random experiment.
sess = api_utils.user_session()

experiment_id = exp.create_experiment(
sess,
conf.fixtures_path("no_op/single-one-short-step.yaml"),
conf.fixtures_path("no_op"),
)
exp.wait_for_experiment_state(
sess,
experiment_id,
bindings.experimentv1State.RUNNING,
)

# Find the trial, and kill the pod.
cmd = f'kubectl delete $(kubectl get pods -o name | grep -i "exp-{experiment_id}")'
subprocess.run(cmd, shell=True)

# Cancel the experiment.
command = ["det", "-m", conf.make_master_url(), "e", "cancel", str(experiment_id)]
detproc.check_call(sess, command)

# Assert that the experiment fails.
assert exp.experiment_state(sess, experiment_id) == bindings.experimentv1State.ERROR
4 changes: 2 additions & 2 deletions e2e_tests/tests/cluster/test_master_restart.py
@@ -397,8 +397,8 @@ def test_master_restart_stopping_container_gone(
restartable_managed_cluster.restart_master()
restartable_managed_cluster.restart_agent(wait_for_amnesia=False)

# TODO(RM-70) make this state be an error.
exp.wait_for_experiment_state(sess, exp_id, bindings.experimentv1State.CANCELED)
exp.wait_for_experiment_state(sess, exp_id, bindings.experimentv1State.ERROR)

trials = exp.experiment_trials(sess, exp_id)
assert len(trials) == 1
assert trials[0].trial.state == bindings.trialv1State.ERROR
9 changes: 3 additions & 6 deletions e2e_tests/tests/experiment/test_core.py
@@ -368,17 +368,14 @@ def test_max_concurrent_trials(name: str, searcher_cfg: str) -> None:
# Give the experiment time to refill max_concurrent_trials.
trials = exp.wait_for_at_least_n_trials(sess, experiment_id, 2)

# The experiment handling the cancel message and waiting for it to be cancelled slyly
# (hackishly) allows us to synchronize with the experiment state after canceling
# the first two trials.
exp.cancel_single(sess, experiment_id)
# Pause the experiment to count the trials.
exp.pause_experiment(sess, experiment_id)

# Make sure that there were never more than 2 total trials created.
trials = exp.wait_for_at_least_n_trials(sess, experiment_id, 2)
assert len(trials) == 2, trials

finally:
exp.kill_single(sess, experiment_id)
exp.kill_experiments(sess, [experiment_id], -1)


@pytest.mark.e2e_cpu
15 changes: 13 additions & 2 deletions master/internal/experiment.go
@@ -760,9 +760,12 @@ func (e *internalExperiment) handleContinueExperiment(reqID model.RequestID) (*i
func (e *internalExperiment) processOperations(
ops []searcher.Operation, err error,
) {
if _, ok := model.StoppingStates[e.State]; ok {
// Only continue for experiments in stopping states if the searcher operations are all
// type Shutdown failures.
if _, ok := model.StoppingStates[e.State]; ok && notASearcherShutdown(ops) {
return
}

if err != nil {
e.syslog.Error(err)
e.updateState(model.StateWithReason{
@@ -832,7 +835,6 @@ func (e *internalExperiment) processOperations(
if err := e.searcher.SetCustomSearcherProgress(op.Progress); err != nil {
e.syslog.WithError(err).Error("failed to set searcher progress")
}

case searcher.Close:
state := e.TrialSearcherState[op.RequestID]
state.Closed = true
@@ -1135,3 +1137,12 @@ func (e *internalExperiment) setRP(resourcePool string) error {

return nil
}

func notASearcherShutdown(ops []searcher.Operation) bool {
Contributor:

I think this function name made it harder to understand the intent of the function and how it was used.

What if the function was named something like allSearcherShutdown(...) and it returned false as soon as it found an operation that wasn't a shutdown error (and otherwise true)?

Then the if statement above would look like

if _, ok := model.StoppingStates[e.State]; ok && !allSearcherShutdown(ops) {

Just my 2 cents. Feel free to ignore if this sounds more confusing to you.
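
For reference, a minimal sketch of that suggested helper (an illustration only, not code from this PR), assuming the same searcher.Operation interface and searcher.Shutdown type used by notASearcherShutdown below:

// allSearcherShutdown reports whether every pending searcher operation is a
// Shutdown, letting the caller bail out of a stopping state as soon as any
// other kind of operation appears.
func allSearcherShutdown(ops []searcher.Operation) bool {
	for _, operation := range ops {
		if _, ok := operation.(searcher.Shutdown); !ok {
			return false
		}
	}
	return true
}

Returning false on the first non-Shutdown operation mirrors the early return in notASearcherShutdown, just with the polarity flipped for the negated call site shown above.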

Contributor Author:

yeah this is what I was afraid of, thanks for the feedback

for _, operation := range ops {
if _, ok := operation.(searcher.Shutdown); !ok {
return true
}
}
return false
}