diff options
author | Jon Marius Venstad <venstad@gmail.com> | 2022-02-02 11:27:58 +0100 |
---|---|---|
committer | Jon Marius Venstad <venstad@gmail.com> | 2022-02-02 11:27:58 +0100 |
commit | 91dee3f7000822f39769025798cc3cbdb35843a7 (patch) | |
tree | 47b767cfac2380486a51f466d1e1dbf6515e40cb /controller-server | |
parent | 34ce6dda4876d24680986451e79c14fa919708c7 (diff) |
Special-case OOC only in test zones, and catch all dependent jobs for a revision change
Diffstat (limited to 'controller-server')
3 files changed, 29 insertions, 33 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index 88a2d0b0a6b..7f7449fb38b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -118,7 +118,7 @@ public class DeploymentStatus { return ! allJobs.matching(job -> criticalJobs.contains(job.id())) .failing() - .not().withStatus(RunStatus.outOfCapacity) + .not().outOfTestCapacity() .isEmpty(); } @@ -126,39 +126,29 @@ public class DeploymentStatus { if (visited.contains(current)) return dependents.contains(current); - if (dependency == current) { + if (dependency == current) dependents.add(current); - visited.add(current); - return true; - } + else + for (StepStatus dep : current.dependencies) + if (fillDependents(dependency, visited, dependents, dep)) + dependents.add(current); - for (StepStatus dep : current.dependencies) - if (fillDependents(dependency, visited, dependents, dep)) - dependents.add(current); visited.add(current); - return dependents.contains(current); } - /** Whether any jobs of the given instance, or in an instance dependent on it, are failing with other errors than lack of capacity in a test zone. */ - public boolean hasFailures(InstanceName instance) { - Set<StepStatus> dependents = new HashSet<>(); - for (StepStatus instanceStep : instanceSteps().values()) - fillDependents(instanceSteps().get(instance), new HashSet<>(), dependents, instanceStep); - - Set<InstanceName> dependentInstances = dependents.stream() - .map(StepStatus::instance) - .collect(Collectors.toSet()); - return ! allJobs.instance(dependentInstances) - .failing() - .not().withStatus(RunStatus.outOfCapacity) + /** Whether any job is failing on anything older than version, with errors other than lack of capacity in a test zone.. */ + public boolean hasFailures(ApplicationVersion version) { + return ! allJobs.failing() + .not().outOfTestCapacity() + .matching(job -> job.lastTriggered().get().versions().targetApplication().compareTo(version) < 0) .isEmpty(); } /** Whether any jobs of this application are failing with other errors than lack of capacity in a test zone. */ public boolean hasFailures() { return ! allJobs.failing() - .not().withStatus(RunStatus.outOfCapacity) + .not().outOfTestCapacity() .isEmpty(); } @@ -380,12 +370,15 @@ public class DeploymentStatus { boolean platformReadyFirst = platformReadyAt.get().isBefore(revisionReadyAt.get()); boolean revisionReadyFirst = revisionReadyAt.get().isBefore(platformReadyAt.get()); switch (application.deploymentSpec().requireInstance(job.application().instance()).upgradeRollout()) { - case separate: // Let whichever change rolled out first, keep rolling first, unless jobs are failing. - return hasFailures() ? List.of(change) - : platformReadyFirst || platformReadyAt.get().equals(Instant.EPOCH) ? List.of(change.withoutApplication(), change) // No jobs have run yet, assume platform came first - : revisionReadyFirst ? List.of(change.withoutPlatform(), change) - : List.of(change); - case leading: // When one change catches up, they fuse and continue together. + case separate: // Let whichever change rolled out first, keep rolling first, unless upgrade alone is failing. + return (platformReadyFirst || platformReadyAt.get().equals(Instant.EPOCH)) // Assume platform was first if no jobs have run yet. + ? step.job().flatMap(jobs()::get).flatMap(JobStatus::firstFailing).isPresent() + ? List.of(change) // Platform was first, but is failing. + : List.of(change.withoutApplication(), change) // Platform was first, and is OK. + : revisionReadyFirst + ? List.of(change.withoutPlatform(), change) // Revision was first. + : List.of(change); // Both ready at the same time, probably due to earlier failure. + case leading: // When one change catches up, they fuse and continue together. return List.of(change); case simultaneous: // Revisions are allowed to run ahead, but the job where it caught up should have both changes. return platformReadyFirst ? List.of(change) : List.of(change.withoutPlatform(), change); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 6da06acb6e5..b944593bbab 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -97,7 +97,7 @@ public class DeploymentTrigger { && status.instanceSteps().get(instanceName) .readyAt(outstanding) .map(readyAt -> ! readyAt.isAfter(clock.instant())).orElse(false) - && acceptNewApplicationVersion(status, instanceName)) { + && acceptNewApplicationVersion(status, instanceName, outstanding.application().get())) { application = application.with(instanceName, instance -> withRemainingChange(instance, instance.change().with(outstanding.application().get()), status)); } @@ -386,13 +386,12 @@ public class DeploymentTrigger { // ---------- Change management o_O ---------- - private boolean acceptNewApplicationVersion(DeploymentStatus status, InstanceName instance) { + private boolean acceptNewApplicationVersion(DeploymentStatus status, InstanceName instance, ApplicationVersion version) { if (status.application().deploymentSpec().instance(instance).isEmpty()) return false; // Unknown instance. - if (status.hasFailures(instance)) return true; // Allow changes to fix upgrade or previous revision problems. + if (status.hasFailures(version)) return true; // Allow changes to fix upgrade or previous revision problems. DeploymentInstanceSpec spec = status.application().deploymentSpec().requireInstance(instance); Change change = status.application().require(instance).change(); - if (change.application().isPresent() && spec.upgradeRevision() == DeploymentSpec.UpgradeRevision.separate) return false; - return true; + return change.application().isEmpty() || spec.upgradeRevision() != DeploymentSpec.UpgradeRevision.separate; } private Instance withRemainingChange(Instance instance, Change change, DeploymentStatus status) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java index ab6816d3646..fe9c6e6f3d6 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java @@ -52,6 +52,10 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { return matching(job -> job.lastCompleted().isPresent() && ! job.isSuccess()); } + public JobList outOfTestCapacity() { + return matching(job -> job.isOutOfCapacity() && job.id().type().environment().isTest()); + } + public JobList running() { return matching(job -> job.isRunning()); } |