diff options
author | jonmv <venstad@gmail.com> | 2023-04-13 11:22:29 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2023-04-13 11:26:21 +0200 |
commit | 78c76f976497d31ede3aa7f4f9cf0bd04d8963c1 (patch) | |
tree | 63e791e70afc2276826d09e1115f9171765983a6 /controller-server | |
parent | 20cf71d92f35c2464d760ad56fde480c95b7d5f1 (diff) |
Compute whether jobs are blocked in the right place, and expose
Diffstat (limited to 'controller-server')
7 files changed, 94 insertions, 79 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index cd1e354135a..00da34fe2e4 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -43,6 +43,7 @@ import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -302,7 +303,11 @@ public class DeploymentStatus { fallbackPlatform(change, job)); if (step.completedAt(change, firstProductionJobWithDeploymentInCloud).isEmpty()) { JobType typeWithZone = job.type().isSystemTest() ? JobType.systemTest(zones, cloud) : JobType.stagingTest(zones, cloud); - jobs.merge(job, List.of(new Job(typeWithZone, versions, step.readiness(change, firstProductionJobWithDeploymentInCloud), change)), DeploymentStatus::union); + Readiness readiness = step.readiness(change, firstProductionJobWithDeploymentInCloud); + jobs.merge(job, List.of(new Job(typeWithZone, + versions, + readiness.okAt(now) && jobs().get(job).get().isRunning() ? readiness.running() : readiness, + change)), DeploymentStatus::union); } }); }); @@ -529,13 +534,15 @@ public class DeploymentStatus { private Map<JobId, List<Job>> productionJobs(InstanceName instance, Change change, boolean assumeUpgradesSucceed) { Map<JobId, List<Job>> jobs = new LinkedHashMap<>(); - jobSteps.forEach((job, step) -> { + for (Entry<JobId, StepStatus> entry : reversed(List.copyOf(jobSteps.entrySet()))) { + JobId job = entry.getKey(); + StepStatus step = entry.getValue(); if ( ! job.application().instance().equals(instance) || ! job.type().isProduction()) - return; + continue; // Signal strict completion criterion by depending on job itself. if (step.completedAt(change, Optional.of(job)).isPresent()) - return; + continue; // When computing eager test jobs for outstanding changes, assume current change completes successfully. Optional<Deployment> deployment = deploymentFor(job); @@ -545,7 +552,7 @@ public class DeploymentStatus { || areIncompatible(change.platform(), existingRevision, job); if (assumeUpgradesSucceed) { if (deployingCompatibilityChange) // No eager tests for this. - return; + continue; Change currentChange = application.require(instance).change(); Versions target = Versions.from(currentChange, application, deployment, fallbackPlatform(currentChange, job)); @@ -555,21 +562,28 @@ public class DeploymentStatus { List<Job> toRun = new ArrayList<>(); List<Change> changes = deployingCompatibilityChange || allJobs.get(job).flatMap(status -> status.lastCompleted()).isEmpty() - ? List.of(change) - : changes(job, step, change); + ? List.of(change) + : changes(job, step, change); for (Change partial : changes) { - Job jobToRun = new Job(job.type(), - Versions.from(partial, application, existingPlatform, existingRevision, fallbackPlatform(partial, job)), - step.readiness(partial, Optional.of(job)), - partial); - toRun.add(jobToRun); + Versions versions = Versions.from(partial, application, existingPlatform, existingRevision, fallbackPlatform(partial, job)); + Readiness readiness = step.readiness(partial, Optional.of(job)); + // This job is blocked if it is already running ... + readiness = jobs().get(job).get().isRunning() && readiness.okAt(now) ? readiness.running() : readiness; + // ... or if it is a deployment, and a test job for the current state is not yet complete, + // which is the case when the next versions to run that test with is not the same as we want to deploy here. + List<Job> tests = job.type().isTest() ? null : jobs.get(new JobId(job.application(), JobType.productionTestOf(job.type().zone()))); + readiness = tests != null && ! versions.targetsMatch(tests.get(0).versions) && readiness.okAt(now) ? readiness.blocked() : readiness; + toRun.add(new Job(job.type(), versions, readiness, partial)); // Assume first partial change is applied before the second. - existingPlatform = Optional.of(jobToRun.versions.targetPlatform()); - existingRevision = Optional.of(jobToRun.versions.targetRevision()); + existingPlatform = Optional.of(versions.targetPlatform()); + existingRevision = Optional.of(versions.targetRevision()); } jobs.put(job, toRun); - }); - return jobs; + } + Map<JobId, List<Job>> jobsInOrder = new LinkedHashMap<>(); + for (Entry<JobId, List<Job>> entry : reversed(List.copyOf(jobs.entrySet()))) + jobsInOrder.put(entry.getKey(), entry.getValue()); + return jobsInOrder; } private boolean areIncompatible(Optional<Version> platform, Optional<RevisionId> revision, JobId job) { @@ -676,12 +690,15 @@ public class DeploymentStatus { for (Job productionJob : versionsList) if (allJobs.successOn(testType, productionJob.versions()) .instance(testJob.application().instance()) - .asList().isEmpty()) + .asList().isEmpty()) { + Readiness readiness = jobSteps().get(testJob).readiness(productionJob.change, Optional.of(job)); testJobs.merge(testJob, List.of(new Job(testJob.type(), productionJob.versions(), - jobSteps().get(testJob).readiness(productionJob.change, Optional.of(job)), + readiness.okAt(now) && jobs().get(testJob).get().isRunning() ? readiness.running() : readiness, productionJob.change)), DeploymentStatus::union); + + } }); } } @@ -1001,9 +1018,9 @@ public class DeploymentStatus { } } if ( ! blocked) - return current == now ? Readiness.empty : new Readiness(current, DelayCause.blocked); + return current == now ? Readiness.empty : new Readiness(current, DelayCause.changeBlocked); } - return new Readiness(now.plusSeconds(1 << 30), DelayCause.blocked); // Some time in the future that doesn't look like anything you'd expect. + return new Readiness(now.plusSeconds(1 << 30), DelayCause.changeBlocked); // Some time in the future that doesn't look like anything you'd expect. } } @@ -1209,33 +1226,36 @@ public class DeploymentStatus { } - public enum DelayCause { none, unverified, notReady, blockedByTest, coolingDown, invalidPackage, blocked, paused } + public enum DelayCause { none, unverified, notReady, blocked, running, coolingDown, invalidPackage, changeBlocked, paused } public record Readiness(Instant at, DelayCause cause) implements Comparable<Readiness> { public static final Readiness unverified = new Readiness(null, DelayCause.unverified); public static final Readiness notReady = new Readiness(null, DelayCause.notReady); public static final Readiness empty = new Readiness(Instant.EPOCH, DelayCause.none); public Readiness(Instant at) { this(at, DelayCause.none); } + public Readiness blocked() { return new Readiness(at, DelayCause.blocked); } + public Readiness running() { return new Readiness(at, DelayCause.running); } public boolean ok() { return at != null; } - public boolean okAt(Instant at) { return ok() && ! at.isBefore(this.at); } + public boolean okAt(Instant at) { return ok() && cause != DelayCause.running && cause != DelayCause.blocked && ! at.isBefore(this.at); } @Override public int compareTo(Readiness o) { return at == null ? o.at == null ? 0 : 1 : o.at == null ? -1 : at.compareTo(o.at); } @Override public String toString() { return ok() ? "ready at " + at + switch (cause) { - case none -> ""; - case blockedByTest -> ": waiting for verification test to complete"; - case coolingDown -> ": cooling down after repeated failures"; - case invalidPackage -> ": invalid application package, must resubmit"; - case blocked -> ": deployment configuration blocks changes"; - case paused -> ": manually paused"; - default -> throw new IllegalStateException(cause + " should not have an instant at which it is ready"); - } - : "not ready: " + switch (cause) { - case unverified -> "waiting for verification test to complete"; - case notReady -> "waiting for dependencies to complete"; - default -> throw new IllegalStateException(cause + " should have an instant at which it is ready"); - }; + case none -> ""; + case coolingDown -> ": cooling down after repeated failures"; + case blocked -> ": waiting for verification test to complete"; + case running -> ": waiting for current run to complete"; + case invalidPackage -> ": invalid application package, must resubmit"; + case changeBlocked -> ": deployment configuration blocks changes"; + case paused -> ": manually paused"; + default -> throw new IllegalStateException(cause + " should not have an instant at which it is ready"); + } + : "not ready" + switch (cause) { + case unverified -> ": waiting for verification test to complete"; + case notReady -> ": waiting for dependencies to complete"; + default -> throw new IllegalStateException(cause + " should have an instant at which it is ready"); + }; } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 32d94245ee1..00a0e22f87d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -20,6 +20,7 @@ import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId; +import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatus.DelayCause; import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatus.Readiness; import java.math.BigDecimal; @@ -40,6 +41,7 @@ import java.util.logging.Logger; import java.util.stream.Collectors; import static java.util.Comparator.comparing; +import static java.util.Comparator.comparingDouble; import static java.util.stream.Collectors.groupingBy; import static java.util.stream.Collectors.toMap; @@ -374,16 +376,17 @@ public class DeploymentTrigger { List<Job> jobs = new ArrayList<>(); Map<JobId, List<DeploymentStatus.Job>> jobsToRun = status.jobsToRun(); jobsToRun.forEach((jobId, jobsList) -> { + abortIfOutdated(status, jobsToRun, jobId); DeploymentStatus.Job job = jobsList.get(0); if ( job.readiness().okAt(clock.instant()) && ! controller.jobController().isDisabled(new JobId(jobId.application(), job.type())) - && ! (jobId.type().isProduction() && isUnhealthyInAnotherZone(status.application(), jobId)) - && abortIfRunning(status, jobsToRun, jobId)) // Abort and trigger this later if running with outdated parameters. + && ! (jobId.type().isProduction() && isUnhealthyInAnotherZone(status.application(), jobId))) { jobs.add(deploymentJob(status.application().require(jobId.application().instance()), job.versions(), job.type(), status.instanceJobs(jobId.application().instance()).get(jobId.type()).isNodeAllocationFailure(), job.readiness().at())); + } }); return Collections.unmodifiableList(jobs); } @@ -402,41 +405,29 @@ public class DeploymentTrigger { return false; } - private void abortIfOutdated(DeploymentStatus status, Map<JobId, List<DeploymentStatus.Job>> jobs, JobId job) { - status.jobs().get(job) - .flatMap(JobStatus::lastTriggered) - .filter(last -> ! last.hasEnded() && last.reason().isEmpty()) - .ifPresent(last -> { - if (jobs.get(job).stream().noneMatch(versions -> versions.versions().targetsMatch(last.versions()) - && versions.versions().sourcesMatchIfPresent(last.versions()))) { - String blocked = jobs.get(job).stream() - .map(scheduled -> scheduled.versions().toString()) - .collect(Collectors.joining(", ")); - log.log(Level.INFO, "Aborting outdated run " + last + ", which is blocking runs: " + blocked); - controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked); - } - }); + private void abortIfOutdated(JobStatus job, List<DeploymentStatus.Job> jobs) { + job.lastTriggered() + .filter(last -> ! last.hasEnded() && last.reason().isEmpty()) + .ifPresent(last -> { + if (jobs.stream().noneMatch(versions -> versions.versions().targetsMatch(last.versions()) + && versions.versions().sourcesMatchIfPresent(last.versions()))) { + String blocked = jobs.stream() + .map(scheduled -> scheduled.versions().toString()) + .collect(Collectors.joining(", ")); + log.log(Level.INFO, "Aborting outdated run " + last + ", which is blocking runs: " + blocked); + controller.jobController().abort(last.id(), "run no longer scheduled, and is blocking scheduled runs: " + blocked); + } + }); } /** Returns whether the job is free to start, and also aborts it if it's running with outdated versions. */ - private boolean abortIfRunning(DeploymentStatus status, Map<JobId, List<DeploymentStatus.Job>> jobs, JobId job) { - abortIfOutdated(status, jobs, job); - boolean blocked = status.jobs().get(job).get().isRunning(); - - if ( ! job.type().isTest()) { - Optional<JobStatus> productionTest = status.jobs().get(new JobId(job.application(), JobType.productionTestOf(job.type().zone()))); - if (productionTest.isPresent()) { - abortIfOutdated(status, jobs, productionTest.get().id()); - // Production deployments are also blocked by their declared tests, if the next versions to run - // for those are not the same as the versions we're considering running in the deployment job now. - if (productionTest.map(JobStatus::id).map(jobs::get) - .map(versions -> ! versions.get(0).versions().targetsMatch(jobs.get(job).get(0).versions())) - .orElse(false)) - blocked = true; - } - } - - return ! blocked; + private void abortIfOutdated(DeploymentStatus status, Map<JobId, List<DeploymentStatus.Job>> jobs, JobId job) { + Readiness readiness = jobs.get(job).get(0).readiness(); + if (readiness.cause() == DelayCause.running) + abortIfOutdated(status.jobs().get(job).get(), jobs.get(job)); + if (readiness.cause() == DelayCause.blocked && ! job.type().isTest()) + status.jobs().get(new JobId(job.application(), JobType.productionTestOf(job.type().zone()))) + .ifPresent(jobStatus -> abortIfOutdated(jobStatus, jobs.get(jobStatus.id()))); } // ---------- Change management o_O ---------- diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java index 92632bdcdb1..804ae7b7805 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java @@ -280,12 +280,12 @@ class JobControllerApiHandlerHelper { switch (readiness.cause()) { case paused -> stepObject.setLong("pausedUntil", until.toEpochMilli()); case coolingDown -> stepObject.setLong("coolingDownUntil", until.toEpochMilli()); - case blocked -> { + case changeBlocked -> { Readiness platformReadiness = stepStatus.readiness(Change.of(controller.systemVersion(versionStatus))); // Dummy version — just anything with a platform. - if (platformReadiness.cause() == DelayCause.blocked) + if (platformReadiness.cause() == DelayCause.changeBlocked) stepObject.setLong("platformBlockedUntil", platformReadiness.at().toEpochMilli()); Readiness applicationReadiness = stepStatus.readiness(Change.of(RevisionId.forProduction(1))); // Dummy version — just anything with an application. - if (applicationReadiness.cause() == DelayCause.blocked) + if (applicationReadiness.cause() == DelayCause.changeBlocked) stepObject.setLong("applicationBlockedUntil", applicationReadiness.at().toEpochMilli()); } } @@ -297,8 +297,9 @@ class JobControllerApiHandlerHelper { case invalidPackage -> "invalidPackage"; case paused -> "paused"; case coolingDown -> "coolingDown"; + case changeBlocked -> "changeBlocked"; case blocked -> "blocked"; - case blockedByTest -> "blockedByTest"; + case running -> "running"; case notReady -> "notReady"; case unverified -> "unverified"; }); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java index 37c76fd5f2f..069ee58e9c5 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/deployment/DeploymentApiHandler.java @@ -175,7 +175,7 @@ public class DeploymentApiHandler extends ThreadedHttpRequestHandler { DeploymentStatus.StepStatus stepStatus = status.instanceSteps().get(instance.instance()); if (stepStatus != null) { // Instance may not have any steps, i.e. an empty deployment spec has been submitted Readiness platformReadiness = stepStatus.blockedUntil(Change.of(statistics.version())); - if (platformReadiness.cause() == DelayCause.blocked) + if (platformReadiness.cause() == DelayCause.changeBlocked) instanceObject.setLong("blockedUntil", platformReadiness.at().toEpochMilli()); } instanceObject.setString("upgradePolicy", toString(status.application().deploymentSpec().instance(instance.instance()) diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 226fb785bf6..afb92d84f3b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -756,8 +756,8 @@ public class DeploymentTriggerTest { // Last job has a different deployment target, so tests need to run again. app1.runJob(productionEuWest1) // Upgrade completes, and revision is the only change. - .runJob(productionUsCentral1) // With only revision change, central should run to cover a previous failure. - .runJob(productionEuWest1); // Finally, west changes revision. + .runJob(productionUsCentral1) // With only revision change, central should run to cover a previous failure. + .runJob(productionEuWest1); // Finally, west changes revision. assertEquals(Change.empty(), app1.instance().change()); assertEquals(Optional.of(RunStatus.success), app1.instanceJobs().get(productionUsCentral1).lastStatus()); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json index 37881b19905..6793553faca 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview-2.json @@ -780,7 +780,8 @@ "declared": true, "instance": "default", "readyAt": 14403000, - "delayCause": null, + "delayedUntil": 14403000, + "delayCause": "running", "jobName": "production-us-central-1", "url": "https://some.url:43/instance/default/job/production-us-central-1", "environment": "prod", diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json index 9a9bc4abf03..ec6ccf3ecf2 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/deployment-overview.json @@ -60,7 +60,8 @@ "declared": false, "instance": "instance1", "readyAt": 0, - "delayCause": null, + "delayedUntil": 0, + "delayCause": "running", "jobName": "system-test", "url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/system-test", "environment": "test", @@ -189,7 +190,8 @@ "declared": false, "instance": "instance1", "readyAt": 0, - "delayCause": null, + "delayedUntil": 0, + "delayCause": "running", "jobName": "staging-test", "url": "http://localhost:8080/application/v4/tenant/tenant1/application/application1/instance/instance1/job/staging-test", "environment": "staging", |