diff options
author | Martin Polden <mpolden@mpolden.no> | 2017-09-22 13:52:39 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2017-09-22 13:52:39 +0200 |
commit | f64392e20e25f43e617e8f605a51876444622e20 (patch) | |
tree | 2143942e79a1673ab30314316a8f84430b2bc1be /controller-server/src | |
parent | f4a65c8f7cb27bd0e449d99e223ba1a0d9135049 (diff) |
Always restart deadlocked deployment
Diffstat (limited to 'controller-server/src')
2 files changed, 68 insertions, 4 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java index c3424b8d9af..6c57c9423ff 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java @@ -48,9 +48,6 @@ public class FailureRedeployer extends Maintainer { private void retryStuckJobs(List<Application> applications) { Instant maxAge = controller().clock().instant().minus(jobTimeout); for (Application application : applications) { - if (!application.deploying().isPresent()) { - continue; - } Optional<JobStatus> job = oldestRunningJob(application); if (!job.isPresent()) { continue; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java index b5ee0469e9f..38ddd8a4a1b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java @@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.persistence.ApplicationSerializer; +import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Test; import java.nio.file.Files; @@ -120,13 +121,79 @@ public class FailureRedeployerTest { tester.failureRedeployer().maintain(); assertEquals(DeploymentJobs.JobType.component.id(), tester.buildSystem().takeJobsToRun().get(0).jobName()); - // Ensure that system-test is trigered after component. Triggering component records a new change, but in this + // Ensure that system-test is triggered after component. Triggering component records a new change, but in this // case there's already a change in progress which we want to discard and start over tester.notifyJobCompletion(DeploymentJobs.JobType.component, app, true); assertEquals(DeploymentJobs.JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName()); } @Test + public void testAlwaysRestartsDeploymentOfApplicationsWithStuckJobs() { + DeploymentTester tester = new DeploymentTester(); + Version version = Version.fromString("5.0"); + tester.updateVersionStatus(version); + + ApplicationPackage applicationPackage = new ApplicationPackageBuilder() + .environment(Environment.prod) + .region("us-west-1") + .build(); + + // Setup applications + Application canary0 = tester.createAndDeploy("canary0", 0, "canary"); + Application canary1 = tester.createAndDeploy("canary1", 1, "canary"); + Application default0 = tester.createAndDeploy("default0", 2, "default"); + Application default1 = tester.createAndDeploy("default1", 3, "default"); + Application default2 = tester.createAndDeploy("default2", 4, "default"); + Application default3 = tester.createAndDeploy("default3", 5, "default"); + Application default4 = tester.createAndDeploy("default4", 6, "default"); + + // New version is released + version = Version.fromString("5.1"); + tester.updateVersionStatus(version); + assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); + tester.upgrader().maintain(); + + // Canaries upgrade and raise confidence + tester.completeUpgrade(canary0, version, "canary"); + tester.completeUpgrade(canary1, version, "canary"); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Applications with default policy start upgrading + tester.clock().advance(Duration.ofMinutes(1)); + tester.upgrader().maintain(); + assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size()); + + // 4/5 applications fail, confidence is lowered and upgrade is cancelled + tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); + tester.upgrader().maintain(); + + // 5th app never reports back and has a dead locked job, but no ongoing change + Application deadLocked = tester.applications().require(default4.id()); + assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress()); + assertFalse("No change present", deadLocked.deploying().isPresent()); + + // 4/5 applications are repaired and confidence is restored + tester.deployCompletely(default0, applicationPackage); + tester.deployCompletely(default1, applicationPackage); + tester.deployCompletely(default2, applicationPackage); + tester.deployCompletely(default3, applicationPackage); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Over 12 hours pass and failure redeployer restarts deployment of 5th app + tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); + tester.failureRedeployer().maintain(); + assertEquals("Deployment is restarted", DeploymentJobs.JobType.component.id(), + tester.buildSystem().jobs().get(0).jobName()); + } + + @Test public void testRetriesJobsFailingForCurrentChange() { DeploymentTester tester = new DeploymentTester(); ApplicationPackage applicationPackage = new ApplicationPackageBuilder() |