diff options
author | jonmv <venstad@gmail.com> | 2022-07-01 17:19:39 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2022-07-01 17:19:39 +0200 |
commit | 0c18fdd48a7202fba90126de776c7fda86a15c44 (patch) | |
tree | bdf59e6c05752df064f8d53fd6ba07f0ce5509b9 /controller-server | |
parent | bc61be2efe931f7d4b5357b2792c001d3266517b (diff) |
Set broken revisions aside, so upgrades may be attempted
Diffstat (limited to 'controller-server')
4 files changed, 56 insertions, 2 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index 24cd92d005f..2b764abb090 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -518,6 +518,7 @@ public class DeploymentStatus { boolean revisionReadyFirst = revisionReadyAt.get().isBefore(platformReadyAt.get()); boolean failingUpgradeOnlyTests = ! jobs().type(systemTest(job.type()), stagingTest(job.type())) .failingHardOn(Versions.from(change.withoutApplication(), application, deploymentFor(job), systemVersion)) + .matching(testJob -> ! testJob.isSuccess()) .isEmpty(); switch (rollout) { case separate: // Let whichever change rolled out first, keep rolling first, unless upgrade alone is failing. diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index c28f94bc4d7..87e86d98da9 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -448,6 +448,10 @@ public class DeploymentTrigger { private boolean acceptNewRevision(DeploymentStatus status, InstanceName instance, RevisionId revision) { if (status.application().deploymentSpec().instance(instance).isEmpty()) return false; // Unknown instance. + if ( ! status.jobs().failingApplicationChange() + .firstFailing().endedNoLaterThan(clock.instant().minus(Duration.ofDays(5))) + .firstFailing().on(revision) + .isEmpty()) return false; // Don't deploy a broken revision. boolean isChangingRevision = status.application().require(instance).change().revision().isPresent(); DeploymentInstanceSpec spec = status.application().deploymentSpec().requireInstance(instance); Predicate<RevisionId> revisionFilter = spec.revisionTarget() == DeploymentSpec.RevisionTarget.next diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index b45e7b046d6..4f84c86350b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -114,7 +114,7 @@ public class Upgrader extends ControllerMaintainer { // Prefer the newest target for each instance. remaining = remaining.not().matching(eligible.asList()::contains) .not().hasCompleted(Change.of(version)); - for (ApplicationId id : outdated.and(eligible.not().upgrading()).not().changingRevision()) + for (ApplicationId id : outdated.and(eligible.not().upgrading())) targets.put(id, version); } @@ -123,6 +123,7 @@ public class Upgrader extends ControllerMaintainer { log.log(Level.INFO, "Triggering upgrade to " + targets.get(id) + " for " + id); if (failingRevision.contains(id)) controller().applications().deploymentTrigger().cancelChange(id, ChangesToCancel.APPLICATION); + controller().applications().deploymentTrigger().triggerChange(id, Change.of(targets.get(id))); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 3b21d3a017e..6fd2a8a6858 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -35,6 +35,7 @@ import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.pro import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.stagingTest; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.systemTest; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.ALL; +import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.APPLICATION; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PIN; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PLATFORM; import static org.junit.Assert.assertEquals; @@ -995,6 +996,53 @@ public class UpgraderTest { } @Test + public void testSettingFailingRevisionAside() { + DeploymentContext app = tester.newDeploymentContext().submit().deploy(); + + // New revision fails. + app.submit(); + Optional<RevisionId> revision1 = app.lastSubmission(); + app.failDeployment(systemTest); + + // New version is not targeted. + Version version1 = new Version("7"); + tester.controllerTester().upgradeSystem(version1); + assertEquals(Change.of(revision1.get()), app.instance().change()); + + tester.upgrader().run(); + assertEquals(Change.of(revision1.get()), app.instance().change()); + + // Broken revision is cancelled, and new version targeted, after some time. + tester.clock().advance(Duration.ofDays(6)); + tester.upgrader().run(); + assertEquals(Change.of(version1), app.instance().change()); + + // Broken revision is not targeted again. + app.triggerJobs(); + tester.upgrader().run(); + tester.outstandingChangeDeployer().run(); + assertEquals(Change.of(version1), app.instance().change()); + + app.failDeployment(systemTest); + tester.upgrader().run(); + tester.outstandingChangeDeployer().run(); + assertEquals(Change.of(version1), app.instance().change()); + + // Revision gets a second change when upgrade fixes the failing job. + tester.clock().advance(Duration.ofDays(12)); // Time for retries. + app.runJob(systemTest).jobAborted(stagingTest).runJob(stagingTest); + tester.upgrader().run(); + tester.outstandingChangeDeployer().run(); + + assertEquals(Change.of(version1).with(revision1.get()), app.instance().change()); + app.runJob(productionUsCentral1); // Upgrade rolls. + app.runJob(systemTest).runJob(stagingTest).runJob(productionUsCentral1); // Revision rolls. + app.runJob(productionUsEast3).runJob(productionUsWest1); // Upgrade completes. + app.runJob(productionUsEast3).runJob(productionUsWest1); // Revision completes. + assertEquals(Change.empty(), app.instance().change()); + } + + @Test public void testsEachUpgradeCombinationWithFailingDeployments() { Version v1 = Version.fromString("6.1"); tester.controllerTester().upgradeSystem(v1); @@ -1085,7 +1133,7 @@ public class UpgraderTest { default2.instanceId(), default2); // Throttle upgrades per run - ((ManualClock) tester.controller().clock()).setInstant(Instant.ofEpochMilli(1589787109000L)); // Fixed random seed + ((ManualClock) tester.controller().clock()).setInstant(Instant.ofEpochMilli(1589787107000L)); // Fixed random seed Upgrader upgrader = new Upgrader(tester.controller(), Duration.ofMinutes(10)); upgrader.setUpgradesPerMinute(0.1); |