summary | refs | log | tree | commit | diff | stats
path: root/controller-server/src/main/java/com/yahoo
diff options
context:
space:
mode:
author jonmv <venstad@gmail.com> 2023-04-28 15:59:16 +0200
committer jonmv <venstad@gmail.com> 2023-04-28 15:59:16 +0200
commit 0129d74396292f894a636061bf58dc53a2ed1a2c (patch)
tree e2c3860ed29858dfa8d1afbd47b82c7e4a57065d /controller-server/src/main/java/com/yahoo
parent 9c04252f9c8573bdbb787b5e3a0f671487eb412e (diff)
When a human cancels a job, do not retry it
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java 4
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java 9
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java 3
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java 9
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java 5
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java 14
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java 3
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java 4
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java 4
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java 2
15 files changed, 41 insertions, 26 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
index 0f1bbfeb25e..91ece6733e1 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
@@ -58,6 +58,7 @@ import static com.yahoo.config.application.api.DeploymentSpec.RevisionTarget.nex
import static com.yahoo.config.provision.Environment.prod;
import static com.yahoo.config.provision.Environment.staging;
import static com.yahoo.config.provision.Environment.test;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.invalidApplication;
import static java.util.Comparator.comparing;
import static java.util.Comparator.naturalOrder;
@@ -1062,6 +1063,7 @@ public class DeploymentStatus {
&& job.isNodeAllocationFailure()) return Readiness.empty;
if (job.lastStatus().get() == invalidApplication) return new Readiness(status.now.plus(Duration.ofSeconds(1 << 30)), DelayCause.invalidPackage);
+ if (job.lastStatus().get() == cancelled) return new Readiness(status.now.plus(Duration.ofSeconds(1 << 30)), DelayCause.coolingDown);
Instant firstFailing = job.firstFailing().get().end().get();
Instant lastCompleted = job.lastCompleted().get().end().get();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 4e699f2c28f..0d362a402dd 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -287,7 +287,7 @@ public class DeploymentTrigger {
.toList();
controller.curator().writeRetriggerEntries(newList);
}
- controller.jobController().abort(run.id(), "force re-triggered");
+ controller.jobController().abort(run.id(), "force re-triggered", false);
return Optional.empty();
} else {
return Optional.of(reTrigger(deployment.applicationId(), jobType, reason));
@@ -413,7 +413,7 @@ public class DeploymentTrigger {
.map(scheduled -> scheduled.versions().toString())
.collect(Collectors.joining(", "));
log.log(Level.INFO, "Aborting outdated run " + last + ", which is blocking runs: " + blocked);
- controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked);
+ controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked, false);
}
});
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 52ddcfd5171..7aacd93813c 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -71,6 +71,7 @@ import static com.yahoo.config.application.api.Notifications.When.failing;
import static com.yahoo.config.application.api.Notifications.When.failingCommit;
import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Node.State.active;
import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Node.State.reserved;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installationFailed;
@@ -836,6 +837,7 @@ public class InternalStepRunner implements StepRunner {
switch (run.status()) {
case running:
case aborted:
+ case cancelled:
case noTests:
case success:
return Optional.empty();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
index 318a6ffe820..6429a8c3cca 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
@@ -76,6 +76,7 @@ import java.util.stream.Stream;
import static com.yahoo.collections.Iterables.reversed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.reset;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running;
import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded;
@@ -547,15 +548,15 @@ public class JobController {
}
/** Marks the given run as aborted; no further normal steps will run, but run-always steps will try to succeed. */
- public void abort(RunId id, String reason) {
+ public void abort(RunId id, String reason, boolean cancelledByHumans) {
locked(id, run -> {
- if (run.status() == aborted)
+ if (run.status() == aborted || run.status() == cancelled)
return run;
run.stepStatuses().entrySet().stream()
.filter(entry -> entry.getValue() == unfinished)
.forEach(entry -> log(id, entry.getKey(), INFO, "Aborting run: " + reason));
- return run.aborted();
+ return run.aborted(cancelledByHumans);
});
}
@@ -837,7 +838,7 @@ public class JobController {
/** Aborts a run and waits for it complete. */
private void abortAndWait(RunId id, Duration timeout) {
- abort(id, "replaced by new deployment");
+ abort(id, "replaced by new deployment", true);
runner.get().accept(last(id.application(), id.type()).get());
Instant doom = controller.clock().instant().plus(timeout);
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
index f3019631ee7..3318f76df6a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
@@ -17,6 +17,7 @@ import java.util.function.Function;
import java.util.function.Predicate;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.nodeAllocationFailure;
/**
@@ -57,7 +58,7 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> {
/** Returns the subset of jobs which are currently failing, not out of test capacity, and not aborted. */
public JobList failingHard() {
- return failing().not().outOfTestCapacity().not().withStatus(aborted);
+ return failing().not().outOfTestCapacity().not().withStatus(aborted).not().withStatus(cancelled);
}
public JobList outOfTestCapacity() {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
index 96187429afa..e2b231e0946 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
@@ -23,6 +23,7 @@ public class JobMetrics {
public static final String noTests = "deployment.noTests";
public static final String error = "deployment.error";
public static final String abort = "deployment.abort";
+ public static final String cancel = "deployment.cancel";
public static final String success = "deployment.success";
private final Metric metric;
@@ -57,6 +58,7 @@ public class JobMetrics {
case testFailure -> testFailure;
case noTests -> noTests;
case error -> error;
+ case cancelled -> cancel;
case aborted -> abort;
case success -> success;
default -> throw new IllegalArgumentException("Unexpected run status '" + status + "'");
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
index fd1e5592608..36df2aeda10 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
@@ -13,6 +13,7 @@ import java.util.Optional;
import java.util.stream.Collectors;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.noTests;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.reset;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running;
@@ -110,10 +111,12 @@ public class Run {
lastTestRecord, lastVespaLogTimestamp, noNodesDownSince, convergenceSummary, Optional.empty(), dryRun, reason);
}
- public Run aborted() {
+ public Run aborted(boolean cancelledByHumans) {
requireActive();
- return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil, aborted, lastTestRecord, lastVespaLogTimestamp,
- noNodesDownSince, convergenceSummary, testerCertificate, dryRun, reason);
+ return new Run(id, steps, versions, isRedeployment, start, end, sleepUntil,
+ cancelledByHumans ? cancelled : aborted,
+ lastTestRecord, lastVespaLogTimestamp, noNodesDownSince,
+ convergenceSummary, testerCertificate, dryRun, reason);
}
public Run reset() {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
index aa727b602e1..b89e89e7002 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
@@ -38,9 +38,12 @@ public enum RunStatus {
/** Everything completed with great success! */
success,
- /** Run was abandoned, due to user intervention or job timeout. */
+ /** Run was abandoned, due to job timeout or blocking a newer target for the same job. */
aborted,
+ /** Cancelled by a human being. */
+ cancelled,
+
/** Run should be reset to its starting state. Used for production tests. */
reset
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java
index 358925d00b2..b9375eeac18 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Step.java
@@ -109,14 +109,12 @@ public enum Step {
succeeded;
public static Step.Status of(RunStatus status) {
- switch (status) {
- case success : throw new AssertionError("Unexpected run status '" + status + "'!");
- case reset :
- case aborted : return unfinished;
- case noTests :
- case running : return succeeded;
- default : return failed;
- }
+ return switch (status) {
+ case success -> throw new AssertionError("Unexpected run status '" + status + "'!");
+ case cancelled, reset, aborted -> unfinished;
+ case noTests, running -> succeeded;
+ default -> failed;
+ };
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
index de1d3ef955d..714e159b649 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
@@ -84,7 +84,7 @@ public class JobRunner extends ControllerMaintainer {
if ( ! run.hasFailed()
&& controller().clock().instant().isAfter(run.sleepUntil().orElse(run.start()).plus(jobTimeout)))
executors.execute(() -> {
- jobs.abort(run.id(), "job timeout of " + jobTimeout + " reached");
+ jobs.abort(run.id(), "job timeout of " + jobTimeout + " reached", false);
advance(run.id());
});
else if (run.readySteps().isEmpty())
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
index 49d108d08df..4da7aa4b2bd 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
@@ -30,6 +30,7 @@ import java.util.Optional;
import java.util.TreeMap;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.cancelled;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.endpointCertificateTimeout;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error;
@@ -342,6 +343,7 @@ class RunSerializer {
case error -> "error";
case success -> "success";
case aborted -> "aborted";
+ case cancelled -> "cancelled";
case reset -> "reset";
};
}
@@ -359,6 +361,7 @@ class RunSerializer {
case "error" -> error;
case "success" -> success;
case "aborted" -> aborted;
+ case "cancelled" -> cancelled;
case "reset" -> reset;
default -> throw new IllegalArgumentException("No run status defined by '" + status + "'!");
};
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
index ab7f4451793..49186be2089 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java
@@ -2552,7 +2552,7 @@ public class ApplicationApiHandler extends AuditLoggingRequestHandler {
controller.applications().deactivate(id.applicationId(), id.zoneId());
controller.jobController().last(id.applicationId(), JobType.deploymentTo(id.zoneId()))
.filter(run -> ! run.hasEnded())
- .ifPresent(last -> controller.jobController().abort(last.id(), "deployment deactivated by " + request.getJDiscRequest().getUserPrincipal().getName()));
+ .ifPresent(last -> controller.jobController().abort(last.id(), "deployment deactivated by " + request.getJDiscRequest().getUserPrincipal().getName(), true));
return new MessageResponse("Deactivated " + id);
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
index 9ff8c7df18b..de777475da2 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
@@ -219,7 +219,7 @@ class JobControllerApiHandlerHelper {
Cursor responseObject = slime.setObject();
Optional<Run> run = jobs.last(id, type).flatMap(last -> jobs.active(last.id()));
if (run.isPresent()) {
- jobs.abort(run.get().id(), "aborted by " + request.getJDiscRequest().getUserPrincipal().getName());
+ jobs.abort(run.get().id(), "aborted by " + request.getJDiscRequest().getUserPrincipal().getName(), true);
responseObject.setString("message", "Aborting " + run.get().id());
}
else
@@ -230,7 +230,7 @@ class JobControllerApiHandlerHelper {
private static String nameOf(RunStatus status) {
return switch (status) {
case reset, running -> "running";
- case aborted -> "aborted";
+ case cancelled, aborted -> "aborted";
case error -> "error";
case testFailure -> "testFailure";
case noTests -> "noTests";
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java
index 814241a765c..ae1949e2214 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/Badges.java
@@ -53,11 +53,11 @@ public class Badges {
return switch (run.status()) {
case running -> switch (previous.orElse(RunStatus.success)) {
case success -> "url(#run-on-success)";
- case aborted, noTests -> "url(#run-on-warning)";
+ case cancelled, aborted, noTests -> "url(#run-on-warning)";
default -> "url(#run-on-failure)";
};
case success -> success;
- case aborted, noTests -> warning;
+ case cancelled, aborted, noTests -> warning;
default -> failure;
};
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java
index 6e5635e8c8c..862fa08ab86 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java
@@ -259,7 +259,7 @@ public class DeploymentApiHandler extends ThreadedHttpRequestHandler {
public static String nameOf(RunStatus status) {
return switch (status) {
case reset, running -> "running";
- case aborted -> "aborted";
+ case cancelled, aborted -> "aborted";
case error -> "error";
case testFailure -> "testFailure";
case noTests -> "noTests";