diff options
author | jonmv <venstad@gmail.com> | 2022-07-04 10:16:56 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2022-07-04 10:16:56 +0200 |
commit | 3be71ffbfe3f2a3c147fafa4fb27d72e600743fe (patch) | |
tree | 8251e310ff0f63ddfa9968984db6b5e83c764713 /controller-server | |
parent | 57056557daa3ad2c0fe6163d64ca03a3249e18d1 (diff) |
Reset failure duration on new revisions, for cancellation by upgrader
Diffstat (limited to 'controller-server')
4 files changed, 25 insertions, 11 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java index 22df5ca559e..4a00a272c75 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.collections.AbstractFilteringList; import com.yahoo.component.Version; +import com.yahoo.vespa.hosted.controller.application.Change; import java.time.Instant; import java.util.Collection; @@ -36,8 +37,10 @@ public class DeploymentStatusList extends AbstractFilteringList<DeploymentStatus /** Returns the subset of applications which have been failing an application change since the given instant */ public DeploymentStatusList failingApplicationChangeSince(Instant threshold) { - return matching(status -> status.instanceJobs().values().stream() - .anyMatch(jobs -> failingApplicationChangeSince(jobs, threshold))); + return matching(status -> status.instanceJobs().entrySet().stream() + .anyMatch(jobs -> failingApplicationChangeSince(jobs.getValue(), + status.application().require(jobs.getKey().instance()).change(), + threshold))); } private static boolean failingUpgradeToVersionSince(JobList jobs, Version version, Instant threshold) { @@ -47,10 +50,8 @@ public class DeploymentStatusList extends AbstractFilteringList<DeploymentStatus .isEmpty(); } - private static boolean failingApplicationChangeSince(JobList jobs, Instant threshold) { - return ! jobs.failingApplicationChange() - .firstFailing().endedNoLaterThan(threshold) - .isEmpty(); + private static boolean failingApplicationChangeSince(JobList jobs, Change change, Instant threshold) { + return change.revision().map(revision -> ! jobs.failingWithBrokenRevisionSince(revision, threshold).isEmpty()).orElse(false); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 11f8e201a84..7ceeda08d3a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -449,9 +449,7 @@ public class DeploymentTrigger { private boolean acceptNewRevision(DeploymentStatus status, InstanceName instance, RevisionId revision) { if (status.application().deploymentSpec().instance(instance).isEmpty()) return false; // Unknown instance. - if ( ! status.jobs().failingApplicationChange() - .firstFailing().endedNoLaterThan(clock.instant().minus(maxFailingRevisionTime)) - .firstFailing().on(revision) + if ( ! status.jobs().failingWithBrokenRevisionSince(revision, clock.instant().minus(maxFailingRevisionTime)) .isEmpty()) return false; // Don't deploy a broken revision. boolean isChangingRevision = status.application().require(instance).change().revision().isPresent(); DeploymentInstanceSpec spec = status.application().deploymentSpec().requireInstance(instance); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java index 551f841233e..3074c9ac3ba 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java @@ -4,7 +4,6 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.collections.AbstractFilteringList; import com.yahoo.component.Version; import com.yahoo.config.provision.InstanceName; -import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RevisionId; @@ -74,6 +73,14 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> { return matching(JobList::failingApplicationChange); } + /** Returns the subset of jobs which are failing because of an application change, and have been since the threshold, on the given revision. */ + public JobList failingWithBrokenRevisionSince(RevisionId broken, Instant threshold) { + return failingApplicationChange().matching(job -> job.runs().values().stream() + .anyMatch(run -> run.versions().targetRevision().equals(broken) + && run.hasFailed() + && run.start().isBefore(threshold))); + } + /** Returns the subset of jobs which are failing with the given run status. */ public JobList withStatus(RunStatus status) { return matching(job -> job.lastStatus().map(status::equals).orElse(false)); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 5e3bf2832d8..5968b490c09 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -1013,6 +1013,14 @@ public class UpgraderTest { tester.upgrader().run(); assertEquals(Change.of(revision1.get()), app.instance().change()); + // Broken revision is replaced by a new attempt, which also fails, and cancellation is not yet triggered. + tester.clock().advance(DeploymentTrigger.maxFailingRevisionTime.plusSeconds(1)); + app.submit(); + Optional<RevisionId> revision2 = app.lastSubmission(); + app.failDeployment(systemTest); + tester.upgrader().run(); + assertEquals(Change.of(revision2.get()), app.instance().change()); + // Broken revision is cancelled, and new version targeted, after some time. tester.clock().advance(DeploymentTrigger.maxFailingRevisionTime.plusSeconds(1)); tester.upgrader().run(); @@ -1035,7 +1043,7 @@ public class UpgraderTest { tester.upgrader().run(); tester.outstandingChangeDeployer().run(); - assertEquals(Change.of(version1).with(revision1.get()), app.instance().change()); + assertEquals(Change.of(version1).with(revision2.get()), app.instance().change()); app.runJob(systemTest).runJob(stagingTest).runJob(productionUsCentral1); // Revision rolls. app.runJob(productionUsEast3).runJob(productionUsWest1); // Upgrade completes. app.runJob(productionUsEast3).runJob(productionUsWest1); // Revision completes. |