diff options
author | jonmv <venstad@gmail.com> | 2023-04-28 15:59:16 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2023-04-28 15:59:16 +0200 |
commit | 0129d74396292f894a636061bf58dc53a2ed1a2c (patch) | |
tree | e2c3860ed29858dfa8d1afbd47b82c7e4a57065d /controller-server/src/main/java/com/yahoo | |
parent | 9c04252f9c8573bdbb787b5e3a0f671487eb412e (diff) |
When a human cancels a job, do not retry it
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
15 files changed, 41 insertions, 26 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index 0f1bbfeb25e..91ece6733e1 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -58,6 +58,7 @@ import static com.yahoo.config.application.api.DeploymentSpec.RevisionTarget.nex import static com.yahoo.config.provision.Environment.prod; import static com.yahoo.config.provision.Environment.staging; import static com.yahoo.config.provision.Environment.test; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.invalidApplication; import static java.util.Comparator.comparing; import static java.util.Comparator.naturalOrder; @@ -1062,6 +1063,7 @@ public class DeploymentStatus { && job.isNodeAllocationFailure()) return Readiness.empty; if (job.lastStatus().get() == invalidApplication) return new Readiness(status.now.plus(Duration.ofSeconds(1 << 30)), DelayCause.invalidPackage); + if (job.lastStatus().get() == cancelled) return new Readiness(status.now.plus(Duration.ofSeconds(1 << 30)), DelayCause.coolingDown); Instant firstFailing = job.firstFailing().get().end().get(); Instant lastCompleted = job.lastCompleted().get().end().get(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 4e699f2c28f..0d362a402dd 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -287,7 +287,7 @@ public class DeploymentTrigger { .toList(); controller.curator().writeRetriggerEntries(newList); } - controller.jobController().abort(run.id(), "force re-triggered"); + controller.jobController().abort(run.id(), "force re-triggered", false); return Optional.empty(); } else { return Optional.of(reTrigger(deployment.applicationId(), jobType, reason)); @@ -413,7 +413,7 @@ public class DeploymentTrigger { .map(scheduled -> scheduled.versions().toString()) .collect(Collectors.joining(", ")); log.log(Level.INFO, "Aborting outdated run " + last + ", which is blocking runs: " + blocked); - controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked); + controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked, false); } }); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 52ddcfd5171..7aacd93813c 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -71,6 +71,7 @@ import static com.yahoo.config.application.api.Notifications.When.failing; import static com.yahoo.config.application.api.Notifications.When.failingCommit; import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Node.State.active; import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Node.State.reserved; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installationFailed; @@ -836,6 +837,7 @@ public class InternalStepRunner implements StepRunner { switch (run.status()) { case running: case aborted: + case cancelled: case noTests: case success: return Optional.empty(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java index 318a6ffe820..6429a8c3cca 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java @@ -76,6 +76,7 @@ import java.util.stream.Stream; import static com.yahoo.collections.Iterables.reversed; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.reset; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running; import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded; @@ -547,15 +548,15 @@ public class JobController { } /** Marks the given run as aborted; no further normal steps will run, but run-always steps will try to succeed. */ - public void abort(RunId id, String reason) { + public void abort(RunId id, String reason, boolean cancelledByHumans) { locked(id, run -> { - if (run.status() == aborted) + if (run.status() == aborted || run.status() == cancelled) return run; run.stepStatuses().entrySet().stream() .filter(entry -> entry.getValue() == unfinished) .forEach(entry -> log(id, entry.getKey(), INFO, "Aborting run: " + reason)); - return run.aborted(); + return run.aborted(cancelledByHumans); }); } @@ -837,7 +838,7 @@ public class JobController { /** Aborts a run and waits for it complete. */ private void abortAndWait(RunId id, Duration timeout) { - abort(id, "replaced by new deployment"); + abort(id, "replaced by new deployment", true); runner.get().accept(last(id.application(), id.type()).get()); Instant doom = controller.clock().instant().plus(timeout); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java index f3019631ee7..3318f76df6a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java @@ -17,6 +17,7 @@ import java.util.function.Function; import java.util.function.Predicate; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.nodeAllocationFailure; /** @@ -57,7 +58,7 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { /** Returns the subset of jobs which are currently failing, not out of test capacity, and not aborted. */ public JobList failingHard() { - return failing().not().outOfTestCapacity().not().withStatus(aborted); + return failing().not().outOfTestCapacity().not().withStatus(aborted).not().withStatus(cancelled); } public JobList outOfTestCapacity() { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java index 96187429afa..e2b231e0946 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java @@ -23,6 +23,7 @@ public class JobMetrics { public static final String noTests = "deployment.noTests"; public static final String error = "deployment.error"; public static final String abort = "deployment.abort"; + public static final String cancel = "deployment.cancel"; public static final String success = "deployment.success"; private final Metric metric; @@ -57,6 +58,7 @@ public class JobMetrics { case testFailure -> testFailure; case noTests -> noTests; case error -> error; + case cancelled -> cancel; case aborted -> abort; case success -> success; default -> throw new IllegalArgumentException("Unexpected run status '" + status + "'"); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java index fd1e5592608..36df2aeda10 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java @@ -13,6 +13,7 @@ import java.util.Optional; import java.util.stream.Collectors; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.noTests; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.reset; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running; @@ -110,10 +111,12 @@ public class Run { lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, Optional.empty(), dryRun, reason); } - public Run aborted() { + public Run aborted(boolean cancelledByHumans) { requireActive(); - return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, aborted, lastTestRecord, lastVespaLogTimestamp, - noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason); + return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, + cancelledByHumans ? cancelled : aborted, + lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, + convergenceSummary, testerCertificate, dryRun, reason); } public Run reset() { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java index aa727b602e1..b89e89e7002 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java @@ -38,9 +38,12 @@ public enum RunStatus { /** Everything completed with great success! */ success, - /** Run was abandoned, due to user intervention or job timeout. */ + /** Run was abandoned, due to job timeout or blocking a newer target for the same job. */ aborted, + /** Cancelled by a human being. */ + cancelled, + /** Run should be reset to its starting state. Used for production tests. */ reset diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java index 358925d00b2..b9375eeac18 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java @@ -109,14 +109,12 @@ public enum Step { succeeded; public static Step.Status of(RunStatus status) { - switch (status) { - case success : throw new AssertionError("Unexpected run status '" + status + "'!"); - case reset : - case aborted : return unfinished; - case noTests : - case running : return succeeded; - default : return failed; - } + return switch (status) { + case success -> throw new AssertionError("Unexpected run status '" + status + "'!"); + case cancelled, reset, aborted -> unfinished; + case noTests, running -> succeeded; + default -> failed; + }; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java index de1d3ef955d..714e159b649 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java @@ -84,7 +84,7 @@ public class JobRunner extends ControllerMaintainer { if ( ! run.hasFailed() && controller().clock().instant().isAfter(run.sleepUntil().orElse(run.start()).plus(jobTimeout))) executors.execute(() -> { - jobs.abort(run.id(), "job timeout of " + jobTimeout + " reached"); + jobs.abort(run.id(), "job timeout of " + jobTimeout + " reached", false); advance(run.id()); }); else if (run.readySteps().isEmpty()) diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java index 49d108d08df..4da7aa4b2bd 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java @@ -30,6 +30,7 @@ import java.util.Optional; import java.util.TreeMap; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.endpointCertificateTimeout; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error; @@ -342,6 +343,7 @@ class RunSerializer { case error -> "error"; case success -> "success"; case aborted -> "aborted"; + case cancelled -> "cancelled"; case reset -> "reset"; }; } @@ -359,6 +361,7 @@ class RunSerializer { case "error" -> error; case "success" -> success; case "aborted" -> aborted; + case "cancelled" -> cancelled; case "reset" -> reset; default -> throw new IllegalArgumentException("No run status defined by '" + status + "'!"); }; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java index ab7f4451793..49186be2089 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java @@ -2552,7 +2552,7 @@ public class ApplicationApiHandler extends AuditLoggingRequestHandler { controller.applications().deactivate(id.applicationId(), id.zoneId()); controller.jobController().last(id.applicationId(), JobType.deploymentTo(id.zoneId())) .filter(run -> ! run.hasEnded()) - .ifPresent(last -> controller.jobController().abort(last.id(), "deployment deactivated by " + request.getJDiscRequest().getUserPrincipal().getName())); + .ifPresent(last -> controller.jobController().abort(last.id(), "deployment deactivated by " + request.getJDiscRequest().getUserPrincipal().getName(), true)); return new MessageResponse("Deactivated " + id); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java index 9ff8c7df18b..de777475da2 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java @@ -219,7 +219,7 @@ class JobControllerApiHandlerHelper { Cursor responseObject = slime.setObject(); Optional<Run> run = jobs.last(id, type).flatMap(last -> jobs.active(last.id())); if (run.isPresent()) { - jobs.abort(run.get().id(), "aborted by " + request.getJDiscRequest().getUserPrincipal().getName()); + jobs.abort(run.get().id(), "aborted by " + request.getJDiscRequest().getUserPrincipal().getName(), true); responseObject.setString("message", "Aborting " + run.get().id()); } else @@ -230,7 +230,7 @@ class JobControllerApiHandlerHelper { private static String nameOf(RunStatus status) { return switch (status) { case reset, running -> "running"; - case aborted -> "aborted"; + case cancelled, aborted -> "aborted"; case error -> "error"; case testFailure -> "testFailure"; case noTests -> "noTests"; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java index 814241a765c..ae1949e2214 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java @@ -53,11 +53,11 @@ public class Badges { return switch (run.status()) { case running -> switch (previous.orElse(RunStatus.success)) { case success -> "url(#run-on-success)"; - case aborted, noTests -> "url(#run-on-warning)"; + case cancelled, aborted, noTests -> "url(#run-on-warning)"; default -> "url(#run-on-failure)"; }; case success -> success; - case aborted, noTests -> warning; + case cancelled, aborted, noTests -> warning; default -> failure; }; } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java index 6e5635e8c8c..862fa08ab86 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java @@ -259,7 +259,7 @@ public class DeploymentApiHandler extends ThreadedHttpRequestHandler { public static String nameOf(RunStatus status) { return switch (status) { case reset, running -> "running"; - case aborted -> "aborted"; + case cancelled, aborted -> "aborted"; case error -> "error"; case testFailure -> "testFailure"; case noTests -> "noTests"; |