diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2022-02-15 20:45:53 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-02-15 20:45:53 +0100 |
commit | 910e2f0bacae0c7ee5e97b2783f57e4023c5c069 (patch) | |
tree | 3b711d99c645bd7073d823d0f3433cc8c3c665d3 | |
parent | 2ebbc783aa190ea40aac166638f01cfc58a00c91 (diff) | |
parent | 1f1de84fdb9b0f5eb7bda44000e4b32799f650b6 (diff) |
Merge pull request #21204 from vespa-engine/jonmv/long-deployment-pipelines-2
Jonmv/long deployment pipelines 2
5 files changed, 133 insertions, 76 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java index 85762cf530a..cc444d50c3e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java @@ -84,7 +84,7 @@ public class InstanceList extends AbstractFilteringList<ApplicationId, InstanceL /** Returns the subset of instances which currently have failing jobs on the given version */ public InstanceList failingOn(Version version) { return matching(id -> ! instances.get(id).instanceJobs().get(id).failing() - .not().withStatus(outOfCapacity) + .not().outOfTestCapacity() .lastCompleted().on(version).isEmpty()); } @@ -95,7 +95,7 @@ public class InstanceList extends AbstractFilteringList<ApplicationId, InstanceL /** Returns the subset of instances which are not currently failing any jobs. */ public InstanceList failing() { - return matching(id -> ! instances.get(id).instanceJobs().get(id).failing().not().withStatus(outOfCapacity).isEmpty()); + return matching(id -> ! instances.get(id).instanceJobs().get(id).failing().not().outOfTestCapacity().isEmpty()); } /** Returns the subset of instances which are currently failing an upgrade. */ diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index d3271a4abb1..84162729b43 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -392,10 +392,12 @@ public class DeploymentStatus { Optional<Instant> revisionReadyAt = step.dependenciesCompletedAt(change.withoutPlatform(), Optional.of(job)); // If neither change is ready, we guess based on the specified rollout. - if (platformReadyAt.isEmpty() && revisionReadyAt.isEmpty()) switch (rollout) { - case separate: return List.of(change.withoutApplication(), change); // Platform should stay ahead. - case leading: return List.of(change); // They should eventually join. - case simultaneous: return List.of(change.withoutPlatform(), change); // Revision should get ahead. + if (platformReadyAt.isEmpty() && revisionReadyAt.isEmpty()) { + switch (rollout) { + case separate: return List.of(change.withoutApplication(), change); // Platform should stay ahead. + case leading: return List.of(change); // They should eventually join. + case simultaneous: return List.of(change.withoutPlatform(), change); // Revision should get ahead. + } } // If only the revision is ready, we run that first. diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index 9af838b1f55..960205526b7 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -15,18 +15,19 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence; import java.time.Duration; -import java.util.Collection; +import java.util.ArrayList; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Random; -import java.util.function.BinaryOperator; -import java.util.function.Function; +import java.util.function.UnaryOperator; +import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PLATFORM; -import static java.util.Comparator.naturalOrder; +import static java.util.Comparator.reverseOrder; /** * Maintenance job which schedules applications for Vespa version upgrade @@ -54,91 +55,89 @@ public class Upgrader extends ControllerMaintainer { public double maintain() { // Determine target versions for each upgrade policy VersionStatus versionStatus = controller().readVersionStatus(); - Version canaryTarget = controller().systemVersion(versionStatus); - Collection<Version> defaultTargets = targetVersions(Confidence.normal, versionStatus); - Collection<Version> conservativeTargets = targetVersions(Confidence.high, versionStatus); + cancelBrokenUpgrades(versionStatus); - cancelUpgrades(versionStatus, canaryTarget, defaultTargets, conservativeTargets); - upgrade(versionStatus, canaryTarget, defaultTargets, conservativeTargets); - return 1.0; - } + Optional<Integer> targetMajorVersion = targetMajorVersion(); + InstanceList instances = instances(controller().systemVersion(versionStatus)); + for (UpgradePolicy policy : UpgradePolicy.values()) + updateTargets(versionStatus, instances, policy, targetMajorVersion); - /** Returns the target versions for given confidence, one per major version in the system */ - private Collection<Version> targetVersions(Confidence confidence, VersionStatus versionStatus) { - return versionStatus.versions().stream() - // Ensure we never pick a version newer than the system - .filter(v -> !v.versionNumber().isAfter(controller().systemVersion(versionStatus))) - .filter(v -> v.confidence().equalOrHigherThan(confidence)) - .map(VespaVersion::versionNumber) - .collect(Collectors.toMap(Version::getMajor, // Key on major version - Function.identity(), // Use version as value - BinaryOperator.<Version>maxBy(naturalOrder()))) // Pick highest version when merging versions within this major - .values(); + return 1.0; } /** Returns a list of all production application instances, except those which are pinned, which we should not manipulate here. */ private InstanceList instances(Version systemVersion) { return InstanceList.from(controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable()), systemVersion)) .withDeclaredJobs() + .shuffle(random) + .byIncreasingDeployedVersion() .unpinned(); } - private void cancelUpgrades(VersionStatus versionStatus, Version canaryTarget, Collection<Version> defaultTargets, Collection<Version> conservativeTargets) { - InstanceList instances = instances(controller().systemVersion(versionStatus)); + private void cancelBrokenUpgrades(VersionStatus versionStatus) { // Cancel upgrades to broken targets (let other ongoing upgrades complete to avoid starvation) + InstanceList instances = instances(controller().systemVersion(versionStatus)); for (VespaVersion version : versionStatus.versions()) { if (version.confidence() == Confidence.broken) - cancelUpgradesOf(instances.upgradingTo(version.versionNumber()) - .not().with(UpgradePolicy.canary), + cancelUpgradesOf(instances.upgradingTo(version.versionNumber()).not().with(UpgradePolicy.canary), version.versionNumber() + " is broken"); } + } - // Canaries should always try the canary target - cancelUpgradesOf(instances.upgrading() - .not().upgradingTo(canaryTarget) - .with(UpgradePolicy.canary), - "Outdated target version for Canaries"); - - // Cancel *failed* upgrades to earlier versions, as the new version may fix it - String reason = "Failing on outdated version"; - cancelUpgradesOf(instances.upgrading() - .failing() - .not().upgradingTo(defaultTargets) - .with(UpgradePolicy.defaultPolicy), - reason); - cancelUpgradesOf(instances.upgrading() - .failing() - .not().upgradingTo(conservativeTargets) - .with(UpgradePolicy.conservative), - reason); - } - - private void upgrade(VersionStatus versionStatus, Version canaryTarget, Collection<Version> defaultTargets, Collection<Version> conservativeTargets) { - InstanceList instances = instances(controller().systemVersion(versionStatus)); - Optional<Integer> targetMajorVersion = targetMajorVersion(); - upgrade(instances.with(UpgradePolicy.canary), canaryTarget, targetMajorVersion, instances.size()); - defaultTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.defaultPolicy), target, targetMajorVersion, numberOfApplicationsToUpgrade())); - conservativeTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.conservative), target, targetMajorVersion, numberOfApplicationsToUpgrade())); + private void updateTargets(VersionStatus versionStatus, InstanceList instances, UpgradePolicy policy, Optional<Integer> targetMajorVersion) { + InstanceList remaining = instances.with(policy); + List<Version> targetAndNewer = new ArrayList<>(); + UnaryOperator<InstanceList> cancellationCriterion = policy == UpgradePolicy.canary ? i -> i.not().upgradingTo(targetAndNewer) + : i -> i.failing() + .not().upgradingTo(targetAndNewer); + + Map<ApplicationId, Version> targets = new LinkedHashMap<>(); + for (Version version : targetsForConfidence(versionStatus, policy)) { + targetAndNewer.add(version); + InstanceList eligible = eligibleForVersion(remaining, version, cancellationCriterion, targetMajorVersion); + InstanceList outdated = cancellationCriterion.apply(eligible); + cancelUpgradesOf(outdated.upgrading(), "Upgrading to outdated versions"); + + remaining = remaining.not().matching(eligible.asList()::contains); // Prefer the newest target for each instance. + for (ApplicationId id : outdated.and(eligible.not().deploying())) + targets.put(id, version); + } + + int numberToUpgrade = policy == UpgradePolicy.canary ? instances.size() : numberOfApplicationsToUpgrade(); + for (ApplicationId id : instances.matching(targets.keySet()::contains).first(numberToUpgrade)) { + log.log(Level.INFO, "Triggering upgrade to " + targets.get(id) + " for " + id); + controller().applications().deploymentTrigger().triggerChange(id, Change.of(targets.get(id))); + } + } + + /** Returns target versions for given confidence, by descending version number. */ + private List<Version> targetsForConfidence(VersionStatus versions, UpgradePolicy policy) { + Version systemVersion = controller().systemVersion(versions); + if (policy == UpgradePolicy.canary) + return List.of(systemVersion); + + Confidence target = policy == UpgradePolicy.defaultPolicy ? Confidence.normal : Confidence.high; + return versions.versions().stream() + .filter(version -> ! version.versionNumber().isAfter(systemVersion) + && version.confidence().equalOrHigherThan(target)) + .map(VespaVersion::versionNumber) + .sorted(reverseOrder()) + .collect(Collectors.toList()); } - private void upgrade(InstanceList instances, Version version, Optional<Integer> targetMajorVersion, int numberToUpgrade) { + private InstanceList eligibleForVersion(InstanceList instances, Version version, UnaryOperator<InstanceList> cancellationCriterion, Optional<Integer> targetMajorVersion) { Change change = Change.of(version); - instances.not().failingOn(version) - .allowMajorVersion(version.getMajor(), targetMajorVersion.orElse(version.getMajor())) - .not().deploying() - .not().hasCompleted(change) // Avoid rescheduling change for instances without production steps - .onLowerVersionThan(version) - .canUpgradeAt(version, controller().clock().instant()) - .shuffle(random) // Shuffle so we do not always upgrade instances in the same order - .byIncreasingDeployedVersion() - .first(numberToUpgrade) - .forEach(instance -> controller().applications().deploymentTrigger().triggerChange(instance, change)); + return instances.not().failingOn(version) + .allowMajorVersion(version.getMajor(), targetMajorVersion.orElse(version.getMajor())) + .not().hasCompleted(change) // Avoid rescheduling change for instances without production steps. + .onLowerVersionThan(version) + .canUpgradeAt(version, controller().clock().instant()); } private void cancelUpgradesOf(InstanceList instances, String reason) { instances = instances.unpinned(); if (instances.isEmpty()) return; - log.info("Cancelling upgrading of " + instances.asList().size() + " instances: " + reason); + log.info("Cancelling upgrading of " + instances.asList() + " instances: " + reason); for (ApplicationId instance : instances.asList()) controller().applications().deploymentTrigger().cancelChange(instance, PLATFORM); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index fb34d57ba0d..9f108be9650 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -1263,9 +1263,6 @@ public class DeploymentTriggerTest { assertEquals(Change.empty(), app3.instance().change()); } - // TODO test multi-tier pipeline with various policies - // TODO test multi-tier pipeline with upgrade after new version is the candidate - @Test public void testRevisionJoinsUpgradeWithSeparateRollout() { var appPackage = new ApplicationPackageBuilder().region("us-central-1") @@ -1422,6 +1419,63 @@ public class DeploymentTriggerTest { } @Test + public void testsVeryLengthyPipeline() { + String lengthyDeploymentSpec = + "<deployment version='1.0'>\n" + + " <instance id='alpha'>\n" + + " <test />\n" + + " <staging />\n" + + " <upgrade rollout='simultaneous' />\n" + + " <prod>\n" + + " <region>us-east-3</region>\n" + + " <test>us-east-3</test>\n" + + " </prod>\n" + + " </instance>\n" + + " <instance id='beta'>\n" + + " <upgrade rollout='simultaneous' />\n" + + " <prod>\n" + + " <region>us-east-3</region>\n" + + " <test>us-east-3</test>\n" + + " </prod>\n" + + " </instance>\n" + + " <instance id='gamma'>\n" + + " <upgrade rollout='separate' />\n" + + " <prod>\n" + + " <region>us-east-3</region>\n" + + " <test>us-east-3</test>\n" + + " </prod>\n" + + " </instance>\n" + + "</deployment>\n"; + var appPackage = ApplicationPackageBuilder.fromDeploymentXml(lengthyDeploymentSpec); + var alpha = tester.newDeploymentContext("t", "a", "alpha"); + var beta = tester.newDeploymentContext("t", "a", "beta"); + var gamma = tester.newDeploymentContext("t", "a", "gamma"); + alpha.submit(appPackage).deploy(); + + // A version releases, but when the first upgrade has gotten through alpha, beta, and gamma, a newer version has high confidence. + var version0 = tester.controller().readSystemVersion(); + var version1 = new Version("7.1"); + var version2 = new Version("7.2"); + tester.controllerTester().upgradeSystem(version1); + + tester.upgrader().maintain(); + alpha.runJob(systemTest).runJob(stagingTest) + .runJob(productionUsEast3).runJob(testUsEast3); + assertEquals(Change.empty(), alpha.instance().change()); + + tester.upgrader().maintain(); + beta.runJob(productionUsEast3); + tester.controllerTester().upgradeSystem(version2); + beta.runJob(testUsEast3); + assertEquals(Change.empty(), beta.instance().change()); + + tester.upgrader().maintain(); + assertEquals(Change.of(version2), alpha.instance().change()); + assertEquals(Change.empty(), beta.instance().change()); + assertEquals(Change.of(version1), gamma.instance().change()); + } + + @Test public void testRevisionJoinsUpgradeWithLeadingRollout() { var appPackage = new ApplicationPackageBuilder().region("us-central-1") .region("us-east-3") diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 0e61ffd3ea6..265dedec8d0 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -35,6 +35,7 @@ import static com.yahoo.vespa.hosted.controller.api.integration.deployment.JobTy import static com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType.systemTest; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.ALL; import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PIN; +import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PLATFORM; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -533,7 +534,7 @@ public class UpgraderTest { } @Test - public void testBlockVersionChangeHalfwayThoughThenNewRevision() { + public void testBlockVersionChangeHalfwayThroughThenNewRevision() { // Friday, 16:00 tester.at(Instant.parse("2017-09-29T16:00:00.00Z")); @@ -541,7 +542,7 @@ public class UpgraderTest { tester.controllerTester().upgradeSystem(version); ApplicationPackage applicationPackage = new ApplicationPackageBuilder() - // Block upgrades on weekends and ouside working hours + // Block upgrades on weekends and outside working hours .blockChange(false, true, "mon-fri", "00-09,17-23", "UTC") .blockChange(false, true, "sat-sun", "00-23", "UTC") .region("us-west-1") @@ -577,11 +578,12 @@ public class UpgraderTest { app.runJob(stagingTest).triggerJobs().jobAborted(productionUsCentral1); // Retry will include revision. tester.triggerJobs(); // Triggers us-central-1 before platform upgrade is cancelled. - // A new version is also released, cancelling the upgrade, since it is failing on a now outdated version. + // A new version is also released, and someone cancels the upgrade, suspecting it is faulty. tester.clock().advance(Duration.ofHours(17)); version = Version.fromString("6.4"); tester.controllerTester().upgradeSystem(version); tester.upgrader().maintain(); + tester.deploymentTrigger().cancelChange(app.instanceId(), PLATFORM); // us-central-1 succeeds upgrade to 6.3, with the revision, but us-east-3 wants to proceed with only the revision change. app.runJob(productionUsCentral1); |