summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2017-09-22 13:52:39 +0200
committer Martin Polden <mpolden@mpolden.no> 2017-09-22 13:52:39 +0200
commit f64392e20e25f43e617e8f605a51876444622e20 (patch)
tree 2143942e79a1673ab30314316a8f84430b2bc1be
parent f4a65c8f7cb27bd0e449d99e223ba1a0d9135049 (diff)
Always restart deadlocked deployment
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java 3
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java 69
2 files changed, 68 insertions, 4 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
index c3424b8d9af..6c57c9423ff 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
@@ -48,9 +48,6 @@ public class FailureRedeployer extends Maintainer {
private void retryStuckJobs(List<Application> applications) {
Instant maxAge = controller().clock().instant().minus(jobTimeout);
for (Application application : applications) {
- if (!application.deploying().isPresent()) {
- continue;
- }
Optional<JobStatus> job = oldestRunningJob(application);
if (!job.isPresent()) {
continue;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java
index b5ee0469e9f..38ddd8a4a1b 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java
@@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.application.DeploymentJobs;
import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester;
import com.yahoo.vespa.hosted.controller.persistence.ApplicationSerializer;
+import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import org.junit.Test;
import java.nio.file.Files;
@@ -120,13 +121,79 @@ public class FailureRedeployerTest {
tester.failureRedeployer().maintain();
assertEquals(DeploymentJobs.JobType.component.id(), tester.buildSystem().takeJobsToRun().get(0).jobName());
- // Ensure that system-test is trigered after component. Triggering component records a new change, but in this
+ // Ensure that system-test is triggered after component. Triggering component records a new change, but in this
// case there's already a change in progress which we want to discard and start over
tester.notifyJobCompletion(DeploymentJobs.JobType.component, app, true);
assertEquals(DeploymentJobs.JobType.systemTest.id(), tester.buildSystem().jobs().get(0).jobName());
}
@Test
+ public void testAlwaysRestartsDeploymentOfApplicationsWithStuckJobs() {
+ DeploymentTester tester = new DeploymentTester();
+ Version version = Version.fromString("5.0");
+ tester.updateVersionStatus(version);
+
+ ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
+ .environment(Environment.prod)
+ .region("us-west-1")
+ .build();
+
+ // Setup applications
+ Application canary0 = tester.createAndDeploy("canary0", 0, "canary");
+ Application canary1 = tester.createAndDeploy("canary1", 1, "canary");
+ Application default0 = tester.createAndDeploy("default0", 2, "default");
+ Application default1 = tester.createAndDeploy("default1", 3, "default");
+ Application default2 = tester.createAndDeploy("default2", 4, "default");
+ Application default3 = tester.createAndDeploy("default3", 5, "default");
+ Application default4 = tester.createAndDeploy("default4", 6, "default");
+
+ // New version is released
+ version = Version.fromString("5.1");
+ tester.updateVersionStatus(version);
+ assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber());
+ tester.upgrader().maintain();
+
+ // Canaries upgrade and raise confidence
+ tester.completeUpgrade(canary0, version, "canary");
+ tester.completeUpgrade(canary1, version, "canary");
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
+
+ // Applications with default policy start upgrading
+ tester.clock().advance(Duration.ofMinutes(1));
+ tester.upgrader().maintain();
+ assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size());
+
+ // 4/5 applications fail, confidence is lowered and upgrade is cancelled
+ tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence());
+ tester.upgrader().maintain();
+
+ // 5th app never reports back and has a deadlocked job, but no ongoing change
+ Application deadLocked = tester.applications().require(default4.id());
+ assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress());
+ assertFalse("No change present", deadLocked.deploying().isPresent());
+
+ // 4/5 applications are repaired and confidence is restored
+ tester.deployCompletely(default0, applicationPackage);
+ tester.deployCompletely(default1, applicationPackage);
+ tester.deployCompletely(default2, applicationPackage);
+ tester.deployCompletely(default3, applicationPackage);
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
+
+ // Over 12 hours pass and failure redeployer restarts deployment of 5th app
+ tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1)));
+ tester.failureRedeployer().maintain();
+ assertEquals("Deployment is restarted", DeploymentJobs.JobType.component.id(),
+ tester.buildSystem().jobs().get(0).jobName());
+ }
+
+ @Test
public void testRetriesJobsFailingForCurrentChange() {
DeploymentTester tester = new DeploymentTester();
ApplicationPackage applicationPackage = new ApplicationPackageBuilder()