summaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorjonmv <venstad@gmail.com>2023-04-13 11:22:29 +0200
committerjonmv <venstad@gmail.com>2023-04-13 11:26:21 +0200
commit78c76f976497d31ede3aa7f4f9cf0bd04d8963c1 (patch)
tree63e791e70afc2276826d09e1115f9171765983a6 /controller-server
parent20cf71d92f35c2464d760ad56fde480c95b7d5f1 (diff)
Compute whether jobs are blocked in the right place, and expose
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java90
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java59
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java9
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java2
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java4
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json3
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json6
7 files changed, 94 insertions, 79 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
index cd1e354135a..00da34fe2e4 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
@@ -43,6 +43,7 @@ import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
@@ -302,7 +303,11 @@ public class DeploymentStatus {
fallbackPlatform(change, job));
if (step.completedAt(change, firstProductionJobWithDeploymentInCloud).isEmpty()) {
JobType typeWithZone = job.type().isSystemTest() ? JobType.systemTest(zones, cloud) : JobType.stagingTest(zones, cloud);
- jobs.merge(job, List.of(new Job(typeWithZone, versions, step.readiness(change, firstProductionJobWithDeploymentInCloud), change)), DeploymentStatus::union);
+ Readiness readiness = step.readiness(change, firstProductionJobWithDeploymentInCloud);
+ jobs.merge(job, List.of(new Job(typeWithZone,
+ versions,
+ readiness.okAt(now) && jobs().get(job).get().isRunning() ? readiness.running() : readiness,
+ change)), DeploymentStatus::union);
}
});
});
@@ -529,13 +534,15 @@ public class DeploymentStatus {
private Map<JobId, List<Job>> productionJobs(InstanceName instance, Change change, boolean assumeUpgradesSucceed) {
Map<JobId, List<Job>> jobs = new LinkedHashMap<>();
- jobSteps.forEach((job, step) -> {
+ for (Entry<JobId, StepStatus> entry : reversed(List.copyOf(jobSteps.entrySet()))) {
+ JobId job = entry.getKey();
+ StepStatus step = entry.getValue();
if ( ! job.application().instance().equals(instance) || ! job.type().isProduction())
- return;
+ continue;
// Signal strict completion criterion by depending on job itself.
if (step.completedAt(change, Optional.of(job)).isPresent())
- return;
+ continue;
// When computing eager test jobs for outstanding changes, assume current change completes successfully.
Optional<Deployment> deployment = deploymentFor(job);
@@ -545,7 +552,7 @@ public class DeploymentStatus {
|| areIncompatible(change.platform(), existingRevision, job);
if (assumeUpgradesSucceed) {
if (deployingCompatibilityChange) // No eager tests for this.
- return;
+ continue;
Change currentChange = application.require(instance).change();
Versions target = Versions.from(currentChange, application, deployment, fallbackPlatform(currentChange, job));
@@ -555,21 +562,28 @@ public class DeploymentStatus {
List<Job> toRun = new ArrayList<>();
List<Change> changes = deployingCompatibilityChange
|| allJobs.get(job).flatMap(status -> status.lastCompleted()).isEmpty()
- ? List.of(change)
- : changes(job, step, change);
+ ? List.of(change)
+ : changes(job, step, change);
for (Change partial : changes) {
- Job jobToRun = new Job(job.type(),
- Versions.from(partial, application, existingPlatform, existingRevision, fallbackPlatform(partial, job)),
- step.readiness(partial, Optional.of(job)),
- partial);
- toRun.add(jobToRun);
+ Versions versions = Versions.from(partial, application, existingPlatform, existingRevision, fallbackPlatform(partial, job));
+ Readiness readiness = step.readiness(partial, Optional.of(job));
+ // This job is blocked if it is already running ...
+ readiness = jobs().get(job).get().isRunning() && readiness.okAt(now) ? readiness.running() : readiness;
+ // ... or if it is a deployment, and a test job for the current state is not yet complete,
+ // which is the case when the next versions to run that test with is not the same as we want to deploy here.
+ List<Job> tests = job.type().isTest() ? null : jobs.get(new JobId(job.application(), JobType.productionTestOf(job.type().zone())));
+ readiness = tests != null && ! versions.targetsMatch(tests.get(0).versions) && readiness.okAt(now) ? readiness.blocked() : readiness;
+ toRun.add(new Job(job.type(), versions, readiness, partial));
// Assume first partial change is applied before the second.
- existingPlatform = Optional.of(jobToRun.versions.targetPlatform());
- existingRevision = Optional.of(jobToRun.versions.targetRevision());
+ existingPlatform = Optional.of(versions.targetPlatform());
+ existingRevision = Optional.of(versions.targetRevision());
}
jobs.put(job, toRun);
- });
- return jobs;
+ }
+ Map<JobId, List<Job>> jobsInOrder = new LinkedHashMap<>();
+ for (Entry<JobId, List<Job>> entry : reversed(List.copyOf(jobs.entrySet())))
+ jobsInOrder.put(entry.getKey(), entry.getValue());
+ return jobsInOrder;
}
private boolean areIncompatible(Optional<Version> platform, Optional<RevisionId> revision, JobId job) {
@@ -676,12 +690,15 @@ public class DeploymentStatus {
for (Job productionJob : versionsList)
if (allJobs.successOn(testType, productionJob.versions())
.instance(testJob.application().instance())
- .asList().isEmpty())
+ .asList().isEmpty()) {
+ Readiness readiness = jobSteps().get(testJob).readiness(productionJob.change, Optional.of(job));
testJobs.merge(testJob, List.of(new Job(testJob.type(),
productionJob.versions(),
- jobSteps().get(testJob).readiness(productionJob.change, Optional.of(job)),
+ readiness.okAt(now) && jobs().get(testJob).get().isRunning() ? readiness.running() : readiness,
productionJob.change)),
DeploymentStatus::union);
+
+ }
});
}
}
@@ -1001,9 +1018,9 @@ public class DeploymentStatus {
}
}
if ( ! blocked)
- return current == now ? Readiness.empty : new Readiness(current, DelayCause.blocked);
+ return current == now ? Readiness.empty : new Readiness(current, DelayCause.changeBlocked);
}
- return new Readiness(now.plusSeconds(1 << 30), DelayCause.blocked); // Some time in the future that doesn't look like anything you'd expect.
+ return new Readiness(now.plusSeconds(1 << 30), DelayCause.changeBlocked); // Some time in the future that doesn't look like anything you'd expect.
}
}
@@ -1209,33 +1226,36 @@ public class DeploymentStatus {
}
- public enum DelayCause { none, unverified, notReady, blockedByTest, coolingDown, invalidPackage, blocked, paused }
+ public enum DelayCause { none, unverified, notReady, blocked, running, coolingDown, invalidPackage, changeBlocked, paused }
public record Readiness(Instant at, DelayCause cause) implements Comparable<Readiness> {
public static final Readiness unverified = new Readiness(null, DelayCause.unverified);
public static final Readiness notReady = new Readiness(null, DelayCause.notReady);
public static final Readiness empty = new Readiness(Instant.EPOCH, DelayCause.none);
public Readiness(Instant at) { this(at, DelayCause.none); }
+ public Readiness blocked() { return new Readiness(at, DelayCause.blocked); }
+ public Readiness running() { return new Readiness(at, DelayCause.running); }
public boolean ok() { return at != null; }
- public boolean okAt(Instant at) { return ok() && ! at.isBefore(this.at); }
+ public boolean okAt(Instant at) { return ok() && cause != DelayCause.running && cause != DelayCause.blocked && ! at.isBefore(this.at); }
@Override public int compareTo(Readiness o) {
return at == null ? o.at == null ? 0 : 1
: o.at == null ? -1 : at.compareTo(o.at);
}
@Override public String toString() {
return ok() ? "ready at " + at + switch (cause) {
- case none -> "";
- case blockedByTest -> ": waiting for verification test to complete";
- case coolingDown -> ": cooling down after repeated failures";
- case invalidPackage -> ": invalid application package, must resubmit";
- case blocked -> ": deployment configuration blocks changes";
- case paused -> ": manually paused";
- default -> throw new IllegalStateException(cause + " should not have an instant at which it is ready");
- }
- : "not ready: " + switch (cause) {
- case unverified -> "waiting for verification test to complete";
- case notReady -> "waiting for dependencies to complete";
- default -> throw new IllegalStateException(cause + " should have an instant at which it is ready");
- };
+ case none -> "";
+ case coolingDown -> ": cooling down after repeated failures";
+ case blocked -> ": waiting for verification test to complete";
+ case running -> ": waiting for current run to complete";
+ case invalidPackage -> ": invalid application package, must resubmit";
+ case changeBlocked -> ": deployment configuration blocks changes";
+ case paused -> ": manually paused";
+ default -> throw new IllegalStateException(cause + " should not have an instant at which it is ready");
+ }
+ : "not ready" + switch (cause) {
+ case unverified -> ": waiting for verification test to complete";
+ case notReady -> ": waiting for dependencies to complete";
+ default -> throw new IllegalStateException(cause + " should have an instant at which it is ready");
+ };
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 32d94245ee1..00a0e22f87d 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -20,6 +20,7 @@ import com.yahoo.vespa.hosted.controller.application.ApplicationList;
import com.yahoo.vespa.hosted.controller.application.Change;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId;
+import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatus.DelayCause;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatus.Readiness;
import java.math.BigDecimal;
@@ -40,6 +41,7 @@ import java.util.logging.Logger;
import java.util.stream.Collectors;
import static java.util.Comparator.comparing;
+import static java.util.Comparator.comparingDouble;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.toMap;
@@ -374,16 +376,17 @@ public class DeploymentTrigger {
List<Job> jobs = new ArrayList<>();
Map<JobId, List<DeploymentStatus.Job>> jobsToRun = status.jobsToRun();
jobsToRun.forEach((jobId, jobsList) -> {
+ abortIfOutdated(status, jobsToRun, jobId);
DeploymentStatus.Job job = jobsList.get(0);
if ( job.readiness().okAt(clock.instant())
&& ! controller.jobController().isDisabled(new JobId(jobId.application(), job.type()))
- && ! (jobId.type().isProduction() && isUnhealthyInAnotherZone(status.application(), jobId))
- && abortIfRunning(status, jobsToRun, jobId)) // Abort and trigger this later if running with outdated parameters.
+ && ! (jobId.type().isProduction() && isUnhealthyInAnotherZone(status.application(), jobId))) {
jobs.add(deploymentJob(status.application().require(jobId.application().instance()),
job.versions(),
job.type(),
status.instanceJobs(jobId.application().instance()).get(jobId.type()).isNodeAllocationFailure(),
job.readiness().at()));
+ }
});
return Collections.unmodifiableList(jobs);
}
@@ -402,41 +405,29 @@ public class DeploymentTrigger {
return false;
}
- private void abortIfOutdated(DeploymentStatus status, Map<JobId, List<DeploymentStatus.Job>> jobs, JobId job) {
- status.jobs().get(job)
- .flatMap(JobStatus::lastTriggered)
- .filter(last -> ! last.hasEnded() && last.reason().isEmpty())
- .ifPresent(last -> {
- if (jobs.get(job).stream().noneMatch(versions -> versions.versions().targetsMatch(last.versions())
- && versions.versions().sourcesMatchIfPresent(last.versions()))) {
- String blocked = jobs.get(job).stream()
- .map(scheduled -> scheduled.versions().toString())
- .collect(Collectors.joining(", "));
- log.log(Level.INFO, "Aborting outdated run " + last + ", which is blocking runs: " + blocked);
- controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked);
- }
- });
+ private void abortIfOutdated(JobStatus job, List<DeploymentStatus.Job> jobs) {
+ job.lastTriggered()
+ .filter(last -> ! last.hasEnded() && last.reason().isEmpty())
+ .ifPresent(last -> {
+ if (jobs.stream().noneMatch(versions -> versions.versions().targetsMatch(last.versions())
+ && versions.versions().sourcesMatchIfPresent(last.versions()))) {
+ String blocked = jobs.stream()
+ .map(scheduled -> scheduled.versions().toString())
+ .collect(Collectors.joining(", "));
+ log.log(Level.INFO, "Aborting outdated run " + last + ", which is blocking runs: " + blocked);
+ controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked);
+ }
+ });
}
/** Returns whether the job is free to start, and also aborts it if it's running with outdated versions. */
- private boolean abortIfRunning(DeploymentStatus status, Map<JobId, List<DeploymentStatus.Job>> jobs, JobId job) {
- abortIfOutdated(status, jobs, job);
- boolean blocked = status.jobs().get(job).get().isRunning();
-
- if ( ! job.type().isTest()) {
- Optional<JobStatus> productionTest = status.jobs().get(new JobId(job.application(), JobType.productionTestOf(job.type().zone())));
- if (productionTest.isPresent()) {
- abortIfOutdated(status, jobs, productionTest.get().id());
- // Production deployments are also blocked by their declared tests, if the next versions to run
- // for those are not the same as the versions we're considering running in the deployment job now.
- if (productionTest.map(JobStatus::id).map(jobs::get)
- .map(versions -> ! versions.get(0).versions().targetsMatch(jobs.get(job).get(0).versions()))
- .orElse(false))
- blocked = true;
- }
- }
-
- return ! blocked;
+ private void abortIfOutdated(DeploymentStatus status, Map<JobId, List<DeploymentStatus.Job>> jobs, JobId job) {
+ Readiness readiness = jobs.get(job).get(0).readiness();
+ if (readiness.cause() == DelayCause.running)
+ abortIfOutdated(status.jobs().get(job).get(), jobs.get(job));
+ if (readiness.cause() == DelayCause.blocked && ! job.type().isTest())
+ status.jobs().get(new JobId(job.application(), JobType.productionTestOf(job.type().zone())))
+ .ifPresent(jobStatus -> abortIfOutdated(jobStatus, jobs.get(jobStatus.id())));
}
// ---------- Change management o_O ----------
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
index 92632bdcdb1..804ae7b7805 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
@@ -280,12 +280,12 @@ class JobControllerApiHandlerHelper {
switch (readiness.cause()) {
case paused -> stepObject.setLong("pausedUntil", until.toEpochMilli());
case coolingDown -> stepObject.setLong("coolingDownUntil", until.toEpochMilli());
- case blocked -> {
+ case changeBlocked -> {
Readiness platformReadiness = stepStatus.readiness(Change.of(controller.systemVersion(versionStatus))); // Dummy version — just anything with a platform.
- if (platformReadiness.cause() == DelayCause.blocked)
+ if (platformReadiness.cause() == DelayCause.changeBlocked)
stepObject.setLong("platformBlockedUntil", platformReadiness.at().toEpochMilli());
Readiness applicationReadiness = stepStatus.readiness(Change.of(RevisionId.forProduction(1))); // Dummy version — just anything with an application.
- if (applicationReadiness.cause() == DelayCause.blocked)
+ if (applicationReadiness.cause() == DelayCause.changeBlocked)
stepObject.setLong("applicationBlockedUntil", applicationReadiness.at().toEpochMilli());
}
}
@@ -297,8 +297,9 @@ class JobControllerApiHandlerHelper {
case invalidPackage -> "invalidPackage";
case paused -> "paused";
case coolingDown -> "coolingDown";
+ case changeBlocked -> "changeBlocked";
case blocked -> "blocked";
- case blockedByTest -> "blockedByTest";
+ case running -> "running";
case notReady -> "notReady";
case unverified -> "unverified";
});
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java
index 37c76fd5f2f..069ee58e9c5 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java
@@ -175,7 +175,7 @@ public class DeploymentApiHandler extends ThreadedHttpRequestHandler {
DeploymentStatus.StepStatus stepStatus = status.instanceSteps().get(instance.instance());
if (stepStatus != null) { // Instance may not have any steps, i.e. an empty deployment spec has been submitted
Readiness platformReadiness = stepStatus.blockedUntil(Change.of(statistics.version()));
- if (platformReadiness.cause() == DelayCause.blocked)
+ if (platformReadiness.cause() == DelayCause.changeBlocked)
instanceObject.setLong("blockedUntil", platformReadiness.at().toEpochMilli());
}
instanceObject.setString("upgradePolicy", toString(status.application().deploymentSpec().instance(instance.instance())
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
index 226fb785bf6..afb92d84f3b 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
@@ -756,8 +756,8 @@ public class DeploymentTriggerTest {
// Last job has a different deployment target, so tests need to run again.
app1.runJob(productionEuWest1) // Upgrade completes, and revision is the only change.
- .runJob(productionUsCentral1) // With only revision change, central should run to cover a previous failure.
- .runJob(productionEuWest1); // Finally, west changes revision.
+ .runJob(productionUsCentral1) // With only revision change, central should run to cover a previous failure.
+ .runJob(productionEuWest1); // Finally, west changes revision.
assertEquals(Change.empty(), app1.instance().change());
assertEquals(Optional.of(RunStatus.success), app1.instanceJobs().get(productionUsCentral1).lastStatus());
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json
index 37881b19905..6793553faca 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json
@@ -780,7 +780,8 @@
"declared": true,
"instance": "default",
"readyAt": 14403000,
- "delayCause": null,
+ "delayedUntil": 14403000,
+ "delayCause": "running",
"jobName": "production-us-central-1",
"url": "https://some.url:43/instance/default/job/production-us-central-1",
"environment": "prod",
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json
index 9a9bc4abf03..ec6ccf3ecf2 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json
@@ -60,7 +60,8 @@
"declared": false,
"instance": "instance1",
"readyAt": 0,
- "delayCause": null,
+ "delayedUntil": 0,
+ "delayCause": "running",
"jobName": "system-test",
"url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/system-test",
"environment": "test",
@@ -189,7 +190,8 @@
"declared": false,
"instance": "instance1",
"readyAt": 0,
- "delayCause": null,
+ "delayedUntil": 0,
+ "delayCause": "running",
"jobName": "staging-test",
"url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/staging-test",
"environment": "staging",