diff options
3 files changed, 78 insertions, 7 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java index fd3fa867c99..ab1b5aa3cd4 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java @@ -127,9 +127,9 @@ public class ApplicationList { return listOf(list.stream().sorted(Comparator.comparing(application -> application.deployedVersion().orElse(Version.emptyVersion)))); } - /** Returns the subset of applications which currently do not have any job in progress for the given change */ - public ApplicationList notRunningJobFor(Change.VersionChange change) { - return listOf(list.stream().filter(a -> !hasRunningJob(a, change))); + /** Returns the subset of applications that are not upgrading or started upgrading before the grace period */ + public ApplicationList notCurrentlyUpgrading(Change.VersionChange change, Instant startOfGracePeriod) { + return listOf(list.stream().filter(a -> !currentlyUpgrading(change, a, startOfGracePeriod))); } // ----------------------------------- Internal helpers @@ -157,11 +157,12 @@ public class ApplicationList { return false; } - private static boolean hasRunningJob(Application application, Change.VersionChange change) { + private static boolean currentlyUpgrading(Change.VersionChange change, Application application, Instant instant) { return application.deploymentJobs().jobStatus().values().stream() .filter(JobStatus::inProgress) - .filter(jobStatus -> jobStatus.lastTriggered().isPresent()) - .map(jobStatus -> jobStatus.lastTriggered().get()) + .filter(status -> status.lastTriggered().isPresent()) + .map(status -> status.lastTriggered().get()) + .filter(jobRun -> jobRun.at().isAfter(instant)) .anyMatch(jobRun -> jobRun.version().equals(change.version())); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index 54ef5df8ac0..fd742ab983a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -11,6 +11,7 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import com.yahoo.yolean.Exceptions; import java.time.Duration; +import java.time.Instant; import java.util.logging.Level; import java.util.logging.Logger; @@ -21,6 +22,8 @@ import java.util.logging.Logger; */ public class Upgrader extends Maintainer { + private static final Duration upgradeTimeout = Duration.ofHours(12); + private static final Logger log = Logger.getLogger(Upgrader.class.getName()); public Upgrader(Controller controller, Duration interval, JobControl jobControl) { @@ -62,12 +65,13 @@ public class Upgrader extends Maintainer { private void upgrade(ApplicationList applications, Version version) { Change.VersionChange change = new Change.VersionChange(version); + Instant startOfUpgradePeriod = controller().clock().instant().minus(upgradeTimeout); cancelUpgradesOf(applications.upgradingToLowerThan(version)); applications = applications.notPullRequest(); // Pull requests are deployed as separate applications to test then deleted; No need to upgrade applications = applications.onLowerVersionThan(version); applications = applications.notDeployingApplication(); // wait with applications deploying an application change applications = applications.notFailingOn(version); // try to upgrade only if it hasn't failed on this version - applications = applications.notRunningJobFor(change); // do not trigger multiple jobs simultaneously for same upgrade + applications = applications.notCurrentlyUpgrading(change, startOfUpgradePeriod); // do not trigger again if currently upgrading applications = applications.canUpgradeAt(controller().clock().instant()); // wait with applications that are currently blocking upgrades for (Application application : applications.byIncreasingDeployedVersion().asList()) { try { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index ccdfc8d042e..8584a62383b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -400,4 +400,70 @@ public class UpgraderTest { assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty()); } + @Test + public void testReschedulesUpgradeAfterTimeout() { + DeploymentTester tester = new DeploymentTester(); + Version version = Version.fromString("5.0"); + tester.updateVersionStatus(version); + + ApplicationPackage applicationPackage = new ApplicationPackageBuilder() + .environment(Environment.prod) + .region("us-west-1") + .build(); + + // Setup applications + Application canary0 = tester.createAndDeploy("canary0", 0, "canary"); + Application canary1 = tester.createAndDeploy("canary1", 1, "canary"); + Application default0 = tester.createAndDeploy("default0", 2, "default"); + Application default1 = tester.createAndDeploy("default1", 3, "default"); + Application default2 = tester.createAndDeploy("default2", 4, "default"); + Application default3 = tester.createAndDeploy("default3", 5, "default"); + Application default4 = tester.createAndDeploy("default4", 6, "default"); + + // New version is released + version = Version.fromString("5.1"); + tester.updateVersionStatus(version); + assertEquals(version, tester.controller().versionStatus().systemVersion().get().versionNumber()); + tester.upgrader().maintain(); + + // Canaries upgrade and raise confidence + tester.completeUpgrade(canary0, version, "canary"); + tester.completeUpgrade(canary1, version, "canary"); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Applications with default policy start upgrading + tester.clock().advance(Duration.ofMinutes(1)); + tester.upgrader().maintain(); + assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size()); + + // 4/5 applications fail, confidence is lowered and upgrade is cancelled + tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); + tester.upgrader().maintain(); + + // 5th app never reports back and has a dead locked job, but no ongoing change + Application deadLocked = tester.applications().require(default4.id()); + assertTrue("Jobs in progress", deadLocked.deploymentJobs().inProgress()); + assertFalse("No change present", deadLocked.deploying().isPresent()); + + // 4/5 applications are repaired and confidence is restored + tester.deployCompletely(default0, applicationPackage); + tester.deployCompletely(default1, applicationPackage); + tester.deployCompletely(default2, applicationPackage); + tester.deployCompletely(default3, applicationPackage); + tester.updateVersionStatus(version); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Over 12 hours pass and upgrade is rescheduled for 5th app + tester.clock().advance(Duration.ofHours(12).plus(Duration.ofSeconds(1))); + tester.upgrader().maintain(); + assertEquals("Upgrade is rescheduled", DeploymentJobs.JobType.systemTest.id(), + tester.buildSystem().jobs().get(0).jobName()); + } + } |