summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2017-09-29 11:38:50 +0200
committerMartin Polden <mpolden@mpolden.no>2017-09-29 14:21:33 +0200
commitdfe49809cd3d23c9d7e059ddbf6aee4f763942ab (patch)
tree91d2aae1ab72aea01b750ce1cf4d1538b88249c4
parentb9dab0a9e5e75bb585a9cb73696c816d1a19d27f (diff)
Reschedule upgrades that time out
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java13
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java6
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java66
3 files changed, 78 insertions, 7 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
index fd3fa867c99..ab1b5aa3cd4 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java
@@ -127,9 +127,9 @@ public class ApplicationList {
return listOf(list.stream().sorted(Comparator.comparing(application -> application.deployedVersion().orElse(Version.emptyVersion))));
}
- /** Returns the subset of applications which currently do not have any job in progress for the given change */
- public ApplicationList notRunningJobFor(Change.VersionChange change) {
- return listOf(list.stream().filter(a -> !hasRunningJob(a, change)));
+ /** Returns the subset of applications that are not upgrading or started upgrading before the grace period */
+ public ApplicationList notCurrentlyUpgrading(Change.VersionChange change, Instant startOfGracePeriod) {
+ return listOf(list.stream().filter(a -> !currentlyUpgrading(change, a, startOfGracePeriod)));
}
// ----------------------------------- Internal helpers
@@ -157,11 +157,12 @@ public class ApplicationList {
return false;
}
- private static boolean hasRunningJob(Application application, Change.VersionChange change) {
+ private static boolean currentlyUpgrading(Change.VersionChange change, Application application, Instant instant) {
return application.deploymentJobs().jobStatus().values().stream()
.filter(JobStatus::inProgress)
- .filter(jobStatus -> jobStatus.lastTriggered().isPresent())
- .map(jobStatus -> jobStatus.lastTriggered().get())
+ .filter(status -> status.lastTriggered().isPresent())
+ .map(status -> status.lastTriggered().get())
+ .filter(jobRun -> jobRun.at().isAfter(instant))
.anyMatch(jobRun -> jobRun.version().equals(change.version()));
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
index 54ef5df8ac0..fd742ab983a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
@@ -11,6 +11,7 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
+import java.time.Instant;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -21,6 +22,8 @@ import java.util.logging.Logger;
*/
public class Upgrader extends Maintainer {
+ private static final Duration upgradeTimeout = Duration.ofHours(12);
+
private static final Logger log = Logger.getLogger(Upgrader.class.getName());
public Upgrader(Controller controller, Duration interval, JobControl jobControl) {
@@ -62,12 +65,13 @@ public class Upgrader extends Maintainer {
private void upgrade(ApplicationList applications, Version version) {
Change.VersionChange change = new Change.VersionChange(version);
+ Instant startOfUpgradePeriod = controller().clock().instant().minus(upgradeTimeout);
cancelUpgradesOf(applications.upgradingToLowerThan(version));
applications = applications.notPullRequest(); // Pull requests are deployed as separate applications to test then deleted; No need to upgrade
applications = applications.onLowerVersionThan(version);
applications = applications.notDeployingApplication(); // wait with applications deploying an application change
applications = applications.notFailingOn(version); // try to upgrade only if it hasn't failed on this version
- applications = applications.notRunningJobFor(change); // do not trigger multiple jobs simultaneously for same upgrade
+ applications = applications.notCurrentlyUpgrading(change, startOfUpgradePeriod); // do not trigger again if currently upgrading
applications = applications.canUpgradeAt(controller().clock().instant()); // wait with applications that are currently blocking upgrades
for (Application application : applications.byIncreasingDeployedVersion().asList()) {
try {
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
index ccdfc8d042e..8584a62383b 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
@@ -400,4 +400,70 @@ public class UpgraderTest {
assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty());
}
+ @Test
+ public void testReschedulesUpgradeAfterTimeout() {
+ DeploymentTester tester = new DeploymentTester();
+ Version version = Version.fromString("5.0");
+ tester.updateVersionStatus(version);
+
+ ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
+ .environment(Environment.prod)
+ .region("us-west-1")
+ .build();
+
+ // Setup applications
+ Application canary0 = tester.createAndDeploy("canary0", 0, "canary");
+ Application canary1 = tester.createAndDeploy("canary1", 1, "canary");
+ Application default0 = tester.createAndDeploy("default0", 2, "default");
+ Application default1 = tester.createAndDeploy("default1", 3, "default");
+ Application default2 = tester.createAndDeploy("default2", 4, "default");
+ Application default3 = tester.createAndDeploy("default3", 5, "default");
+ Application default4 = tester.createAndDeploy("default4", 6, "default");
+
+ // New version is released
+ version = Version.fromString("5.1");
+ tester.updateVersionStatus(version);
+ assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber());
+ tester.upgrader().maintain();
+
+ // Canaries upgrade and raise confidence
+ tester.completeUpgrade(canary0, version, "canary");
+ tester.completeUpgrade(canary1, version, "canary");
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
+
+ // Applications with default policy start upgrading
+ tester.clock().advance(Duration.ofMinutes(1));
+ tester.upgrader().maintain();
+ assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size());
+
+ // 4/5 applications fail, confidence is lowered and upgrade is cancelled
+ tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest);
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence());
+ tester.upgrader().maintain();
+
+ // 5th app never reports back and has a dead locked job, but no ongoing change
+ Application deadLocked = tester.applications().require(default4.id());
+ assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress());
+ assertFalse("No change present", deadLocked.deploying().isPresent());
+
+ // 4/5 applications are repaired and confidence is restored
+ tester.deployCompletely(default0, applicationPackage);
+ tester.deployCompletely(default1, applicationPackage);
+ tester.deployCompletely(default2, applicationPackage);
+ tester.deployCompletely(default3, applicationPackage);
+ tester.updateVersionStatus(version);
+ assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence());
+
+ // Over 12 hours pass and upgrade is rescheduled for 5th app
+ tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1)));
+ tester.upgrader().maintain();
+ assertEquals("Upgrade is rescheduled", DeploymentJobs.JobType.systemTest.id(),
+ tester.buildSystem().jobs().get(0).jobName());
+ }
+
}