From 3be71ffbfe3f2a3c147fafa4fb27d72e600743fe Mon Sep 17 00:00:00 2001 From: jonmv Date: Mon, 4 Jul 2022 10:16:56 +0200 Subject: Reset failure duration on new revisions, for cancellation by upgrader --- .../hosted/controller/deployment/DeploymentStatusList.java | 13 +++++++------ .../hosted/controller/deployment/DeploymentTrigger.java | 4 +--- .../yahoo/vespa/hosted/controller/deployment/JobList.java | 9 ++++++++- .../vespa/hosted/controller/maintenance/UpgraderTest.java | 10 +++++++++- 4 files changed, 25 insertions(+), 11 deletions(-) (limited to 'controller-server') diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java index 22df5ca559e..4a00a272c75 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.collections.AbstractFilteringList; import com.yahoo.component.Version; +import com.yahoo.vespa.hosted.controller.application.Change; import java.time.Instant; import java.util.Collection; @@ -36,8 +37,10 @@ public class DeploymentStatusList extends AbstractFilteringList status.instanceJobs().values().stream() - .anyMatch(jobs -> failingApplicationChangeSince(jobs, threshold))); + return matching(status -> status.instanceJobs().entrySet().stream() + .anyMatch(jobs -> failingApplicationChangeSince(jobs.getValue(), + status.application().require(jobs.getKey().instance()).change(), + threshold))); } private static boolean failingUpgradeToVersionSince(JobList jobs, Version version, Instant threshold) { @@ -47,10 +50,8 @@ public class DeploymentStatusList extends AbstractFilteringList ! jobs.failingWithBrokenRevisionSince(revision, threshold).isEmpty()).orElse(false); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 11f8e201a84..7ceeda08d3a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -449,9 +449,7 @@ public class DeploymentTrigger { private boolean acceptNewRevision(DeploymentStatus status, InstanceName instance, RevisionId revision) { if (status.application().deploymentSpec().instance(instance).isEmpty()) return false; // Unknown instance. - if ( ! status.jobs().failingApplicationChange() - .firstFailing().endedNoLaterThan(clock.instant().minus(maxFailingRevisionTime)) - .firstFailing().on(revision) + if ( ! status.jobs().failingWithBrokenRevisionSince(revision, clock.instant().minus(maxFailingRevisionTime)) .isEmpty()) return false; // Don't deploy a broken revision. boolean isChangingRevision = status.application().require(instance).change().revision().isPresent(); DeploymentInstanceSpec spec = status.application().deploymentSpec().requireInstance(instance); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java index 551f841233e..3074c9ac3ba 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java @@ -4,7 +4,6 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.collections.AbstractFilteringList; import com.yahoo.component.Version; import com.yahoo.config.provision.InstanceName; -import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RevisionId; @@ -74,6 +73,14 @@ public class JobList extends AbstractFilteringList { return matching(JobList::failingApplicationChange); } + /** Returns the subset of jobs which are failing because of an application change, and have been since the threshold, on the given revision. */ + public JobList failingWithBrokenRevisionSince(RevisionId broken, Instant threshold) { + return failingApplicationChange().matching(job -> job.runs().values().stream() + .anyMatch(run -> run.versions().targetRevision().equals(broken) + && run.hasFailed() + && run.start().isBefore(threshold))); + } + /** Returns the subset of jobs which are failing with the given run status. */ public JobList withStatus(RunStatus status) { return matching(job -> job.lastStatus().map(status::equals).orElse(false)); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 5e3bf2832d8..5968b490c09 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -1013,6 +1013,14 @@ public class UpgraderTest { tester.upgrader().run(); assertEquals(Change.of(revision1.get()), app.instance().change()); + // Broken revision is replaced by a new attempt, which also fails, and cancellation is not yet triggered. + tester.clock().advance(DeploymentTrigger.maxFailingRevisionTime.plusSeconds(1)); + app.submit(); + Optional revision2 = app.lastSubmission(); + app.failDeployment(systemTest); + tester.upgrader().run(); + assertEquals(Change.of(revision2.get()), app.instance().change()); + // Broken revision is cancelled, and new version targeted, after some time. tester.clock().advance(DeploymentTrigger.maxFailingRevisionTime.plusSeconds(1)); tester.upgrader().run(); @@ -1035,7 +1043,7 @@ public class UpgraderTest { tester.upgrader().run(); tester.outstandingChangeDeployer().run(); - assertEquals(Change.of(version1).with(revision1.get()), app.instance().change()); + assertEquals(Change.of(version1).with(revision2.get()), app.instance().change()); app.runJob(systemTest).runJob(stagingTest).runJob(productionUsCentral1); // Revision rolls. app.runJob(productionUsEast3).runJob(productionUsWest1); // Upgrade completes. app.runJob(productionUsEast3).runJob(productionUsWest1); // Revision completes. -- cgit v1.2.3