aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Marius Venstad <jonmv@users.noreply.github.com>2022-02-15 20:45:53 +0100
committerGitHub <noreply@github.com>2022-02-15 20:45:53 +0100
commit910e2f0bacae0c7ee5e97b2783f57e4023c5c069 (patch)
tree3b711d99c645bd7073d823d0f3433cc8c3c665d3
parent2ebbc783aa190ea40aac166638f01cfc58a00c91 (diff)
parent1f1de84fdb9b0f5eb7bda44000e4b32799f650b6 (diff)
Merge pull request #21204 from vespa-engine/jonmv/long-deployment-pipelines-2
Jonmv/long deployment pipelines 2
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java4
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java10
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java127
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java60
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java8
5 files changed, 133 insertions, 76 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java
index 85762cf530a..cc444d50c3e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/InstanceList.java
@@ -84,7 +84,7 @@ public class InstanceList extends AbstractFilteringList<ApplicationId, InstanceL
/** Returns the subset of instances which currently have failing jobs on the given version */
public InstanceList failingOn(Version version) {
return matching(id -> ! instances.get(id).instanceJobs().get(id).failing()
- .not().withStatus(outOfCapacity)
+ .not().outOfTestCapacity()
.lastCompleted().on(version).isEmpty());
}
@@ -95,7 +95,7 @@ public class InstanceList extends AbstractFilteringList<ApplicationId, InstanceL
/** Returns the subset of instances which are not currently failing any jobs. */
public InstanceList failing() {
- return matching(id -> ! instances.get(id).instanceJobs().get(id).failing().not().withStatus(outOfCapacity).isEmpty());
+ return matching(id -> ! instances.get(id).instanceJobs().get(id).failing().not().outOfTestCapacity().isEmpty());
}
/** Returns the subset of instances which are currently failing an upgrade. */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
index d3271a4abb1..84162729b43 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
@@ -392,10 +392,12 @@ public class DeploymentStatus {
Optional<Instant> revisionReadyAt = step.dependenciesCompletedAt(change.withoutPlatform(), Optional.of(job));
// If neither change is ready, we guess based on the specified rollout.
- if (platformReadyAt.isEmpty() && revisionReadyAt.isEmpty()) switch (rollout) {
- case separate: return List.of(change.withoutApplication(), change); // Platform should stay ahead.
- case leading: return List.of(change); // They should eventually join.
- case simultaneous: return List.of(change.withoutPlatform(), change); // Revision should get ahead.
+ if (platformReadyAt.isEmpty() && revisionReadyAt.isEmpty()) {
+ switch (rollout) {
+ case separate: return List.of(change.withoutApplication(), change); // Platform should stay ahead.
+ case leading: return List.of(change); // They should eventually join.
+ case simultaneous: return List.of(change.withoutPlatform(), change); // Revision should get ahead.
+ }
}
// If only the revision is ready, we run that first.
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
index 9af838b1f55..960205526b7 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
@@ -15,18 +15,19 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence;
import java.time.Duration;
-import java.util.Collection;
+import java.util.ArrayList;
import java.util.LinkedHashMap;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Random;
-import java.util.function.BinaryOperator;
-import java.util.function.Function;
+import java.util.function.UnaryOperator;
+import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PLATFORM;
-import static java.util.Comparator.naturalOrder;
+import static java.util.Comparator.reverseOrder;
/**
* Maintenance job which schedules applications for Vespa version upgrade
@@ -54,91 +55,89 @@ public class Upgrader extends ControllerMaintainer {
public double maintain() {
// Determine target versions for each upgrade policy
VersionStatus versionStatus = controller().readVersionStatus();
- Version canaryTarget = controller().systemVersion(versionStatus);
- Collection<Version> defaultTargets = targetVersions(Confidence.normal, versionStatus);
- Collection<Version> conservativeTargets = targetVersions(Confidence.high, versionStatus);
+ cancelBrokenUpgrades(versionStatus);
- cancelUpgrades(versionStatus, canaryTarget, defaultTargets, conservativeTargets);
- upgrade(versionStatus, canaryTarget, defaultTargets, conservativeTargets);
- return 1.0;
- }
+ Optional<Integer> targetMajorVersion = targetMajorVersion();
+ InstanceList instances = instances(controller().systemVersion(versionStatus));
+ for (UpgradePolicy policy : UpgradePolicy.values())
+ updateTargets(versionStatus, instances, policy, targetMajorVersion);
- /** Returns the target versions for given confidence, one per major version in the system */
- private Collection<Version> targetVersions(Confidence confidence, VersionStatus versionStatus) {
- return versionStatus.versions().stream()
- // Ensure we never pick a version newer than the system
- .filter(v -> !v.versionNumber().isAfter(controller().systemVersion(versionStatus)))
- .filter(v -> v.confidence().equalOrHigherThan(confidence))
- .map(VespaVersion::versionNumber)
- .collect(Collectors.toMap(Version::getMajor, // Key on major version
- Function.identity(), // Use version as value
- BinaryOperator.<Version>maxBy(naturalOrder()))) // Pick highest version when merging versions within this major
- .values();
+ return 1.0;
}
/** Returns a list of all production application instances, except those which are pinned, which we should not manipulate here. */
private InstanceList instances(Version systemVersion) {
return InstanceList.from(controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable()), systemVersion))
.withDeclaredJobs()
+ .shuffle(random)
+ .byIncreasingDeployedVersion()
.unpinned();
}
- private void cancelUpgrades(VersionStatus versionStatus, Version canaryTarget, Collection<Version> defaultTargets, Collection<Version> conservativeTargets) {
- InstanceList instances = instances(controller().systemVersion(versionStatus));
+ private void cancelBrokenUpgrades(VersionStatus versionStatus) {
// Cancel upgrades to broken targets (let other ongoing upgrades complete to avoid starvation)
+ InstanceList instances = instances(controller().systemVersion(versionStatus));
for (VespaVersion version : versionStatus.versions()) {
if (version.confidence() == Confidence.broken)
- cancelUpgradesOf(instances.upgradingTo(version.versionNumber())
- .not().with(UpgradePolicy.canary),
+ cancelUpgradesOf(instances.upgradingTo(version.versionNumber()).not().with(UpgradePolicy.canary),
version.versionNumber() + " is broken");
}
+ }
- // Canaries should always try the canary target
- cancelUpgradesOf(instances.upgrading()
- .not().upgradingTo(canaryTarget)
- .with(UpgradePolicy.canary),
- "Outdated target version for Canaries");
-
- // Cancel *failed* upgrades to earlier versions, as the new version may fix it
- String reason = "Failing on outdated version";
- cancelUpgradesOf(instances.upgrading()
- .failing()
- .not().upgradingTo(defaultTargets)
- .with(UpgradePolicy.defaultPolicy),
- reason);
- cancelUpgradesOf(instances.upgrading()
- .failing()
- .not().upgradingTo(conservativeTargets)
- .with(UpgradePolicy.conservative),
- reason);
- }
-
- private void upgrade(VersionStatus versionStatus, Version canaryTarget, Collection<Version> defaultTargets, Collection<Version> conservativeTargets) {
- InstanceList instances = instances(controller().systemVersion(versionStatus));
- Optional<Integer> targetMajorVersion = targetMajorVersion();
- upgrade(instances.with(UpgradePolicy.canary), canaryTarget, targetMajorVersion, instances.size());
- defaultTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.defaultPolicy), target, targetMajorVersion, numberOfApplicationsToUpgrade()));
- conservativeTargets.forEach(target -> upgrade(instances.with(UpgradePolicy.conservative), target, targetMajorVersion, numberOfApplicationsToUpgrade()));
+ private void updateTargets(VersionStatus versionStatus, InstanceList instances, UpgradePolicy policy, Optional<Integer> targetMajorVersion) {
+ InstanceList remaining = instances.with(policy);
+ List<Version> targetAndNewer = new ArrayList<>();
+ UnaryOperator<InstanceList> cancellationCriterion = policy == UpgradePolicy.canary ? i -> i.not().upgradingTo(targetAndNewer)
+ : i -> i.failing()
+ .not().upgradingTo(targetAndNewer);
+
+ Map<ApplicationId, Version> targets = new LinkedHashMap<>();
+ for (Version version : targetsForConfidence(versionStatus, policy)) {
+ targetAndNewer.add(version);
+ InstanceList eligible = eligibleForVersion(remaining, version, cancellationCriterion, targetMajorVersion);
+ InstanceList outdated = cancellationCriterion.apply(eligible);
+ cancelUpgradesOf(outdated.upgrading(), "Upgrading to outdated versions");
+
+ remaining = remaining.not().matching(eligible.asList()::contains); // Prefer the newest target for each instance.
+ for (ApplicationId id : outdated.and(eligible.not().deploying()))
+ targets.put(id, version);
+ }
+
+ int numberToUpgrade = policy == UpgradePolicy.canary ? instances.size() : numberOfApplicationsToUpgrade();
+ for (ApplicationId id : instances.matching(targets.keySet()::contains).first(numberToUpgrade)) {
+ log.log(Level.INFO, "Triggering upgrade to " + targets.get(id) + " for " + id);
+ controller().applications().deploymentTrigger().triggerChange(id, Change.of(targets.get(id)));
+ }
+ }
+
+ /** Returns target versions for given confidence, by descending version number. */
+ private List<Version> targetsForConfidence(VersionStatus versions, UpgradePolicy policy) {
+ Version systemVersion = controller().systemVersion(versions);
+ if (policy == UpgradePolicy.canary)
+ return List.of(systemVersion);
+
+ Confidence target = policy == UpgradePolicy.defaultPolicy ? Confidence.normal : Confidence.high;
+ return versions.versions().stream()
+ .filter(version -> ! version.versionNumber().isAfter(systemVersion)
+ && version.confidence().equalOrHigherThan(target))
+ .map(VespaVersion::versionNumber)
+ .sorted(reverseOrder())
+ .collect(Collectors.toList());
}
- private void upgrade(InstanceList instances, Version version, Optional<Integer> targetMajorVersion, int numberToUpgrade) {
+ private InstanceList eligibleForVersion(InstanceList instances, Version version, UnaryOperator<InstanceList> cancellationCriterion, Optional<Integer> targetMajorVersion) {
Change change = Change.of(version);
- instances.not().failingOn(version)
- .allowMajorVersion(version.getMajor(), targetMajorVersion.orElse(version.getMajor()))
- .not().deploying()
- .not().hasCompleted(change) // Avoid rescheduling change for instances without production steps
- .onLowerVersionThan(version)
- .canUpgradeAt(version, controller().clock().instant())
- .shuffle(random) // Shuffle so we do not always upgrade instances in the same order
- .byIncreasingDeployedVersion()
- .first(numberToUpgrade)
- .forEach(instance -> controller().applications().deploymentTrigger().triggerChange(instance, change));
+ return instances.not().failingOn(version)
+ .allowMajorVersion(version.getMajor(), targetMajorVersion.orElse(version.getMajor()))
+ .not().hasCompleted(change) // Avoid rescheduling change for instances without production steps.
+ .onLowerVersionThan(version)
+ .canUpgradeAt(version, controller().clock().instant());
}
private void cancelUpgradesOf(InstanceList instances, String reason) {
instances = instances.unpinned();
if (instances.isEmpty()) return;
- log.info("Cancelling upgrading of " + instances.asList().size() + " instances: " + reason);
+ log.info("Cancelling upgrading of " + instances.asList() + " instances: " + reason);
for (ApplicationId instance : instances.asList())
controller().applications().deploymentTrigger().cancelChange(instance, PLATFORM);
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
index fb34d57ba0d..9f108be9650 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
@@ -1263,9 +1263,6 @@ public class DeploymentTriggerTest {
assertEquals(Change.empty(), app3.instance().change());
}
- // TODO test multi-tier pipeline with various policies
- // TODO test multi-tier pipeline with upgrade after new version is the candidate
-
@Test
public void testRevisionJoinsUpgradeWithSeparateRollout() {
var appPackage = new ApplicationPackageBuilder().region("us-central-1")
@@ -1422,6 +1419,63 @@ public class DeploymentTriggerTest {
}
@Test
+ public void testsVeryLengthyPipeline() {
+ String lengthyDeploymentSpec =
+ "<deployment version='1.0'>\n" +
+ " <instance id='alpha'>\n" +
+ " <test />\n" +
+ " <staging />\n" +
+ " <upgrade rollout='simultaneous' />\n" +
+ " <prod>\n" +
+ " <region>us-east-3</region>\n" +
+ " <test>us-east-3</test>\n" +
+ " </prod>\n" +
+ " </instance>\n" +
+ " <instance id='beta'>\n" +
+ " <upgrade rollout='simultaneous' />\n" +
+ " <prod>\n" +
+ " <region>us-east-3</region>\n" +
+ " <test>us-east-3</test>\n" +
+ " </prod>\n" +
+ " </instance>\n" +
+ " <instance id='gamma'>\n" +
+ " <upgrade rollout='separate' />\n" +
+ " <prod>\n" +
+ " <region>us-east-3</region>\n" +
+ " <test>us-east-3</test>\n" +
+ " </prod>\n" +
+ " </instance>\n" +
+ "</deployment>\n";
+ var appPackage = ApplicationPackageBuilder.fromDeploymentXml(lengthyDeploymentSpec);
+ var alpha = tester.newDeploymentContext("t", "a", "alpha");
+ var beta = tester.newDeploymentContext("t", "a", "beta");
+ var gamma = tester.newDeploymentContext("t", "a", "gamma");
+ alpha.submit(appPackage).deploy();
+
+ // A version releases, but when the first upgrade has gotten through alpha, beta, and gamma, a newer version has high confidence.
+ var version0 = tester.controller().readSystemVersion();
+ var version1 = new Version("7.1");
+ var version2 = new Version("7.2");
+ tester.controllerTester().upgradeSystem(version1);
+
+ tester.upgrader().maintain();
+ alpha.runJob(systemTest).runJob(stagingTest)
+ .runJob(productionUsEast3).runJob(testUsEast3);
+ assertEquals(Change.empty(), alpha.instance().change());
+
+ tester.upgrader().maintain();
+ beta.runJob(productionUsEast3);
+ tester.controllerTester().upgradeSystem(version2);
+ beta.runJob(testUsEast3);
+ assertEquals(Change.empty(), beta.instance().change());
+
+ tester.upgrader().maintain();
+ assertEquals(Change.of(version2), alpha.instance().change());
+ assertEquals(Change.empty(), beta.instance().change());
+ assertEquals(Change.of(version1), gamma.instance().change());
+ }
+
+ @Test
public void testRevisionJoinsUpgradeWithLeadingRollout() {
var appPackage = new ApplicationPackageBuilder().region("us-central-1")
.region("us-east-3")
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
index 0e61ffd3ea6..265dedec8d0 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java
@@ -35,6 +35,7 @@ import static com.yahoo.vespa.hosted.controller.api.integration.deployment.JobTy
import static com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType.systemTest;
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.ALL;
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PIN;
+import static com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel.PLATFORM;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -533,7 +534,7 @@ public class UpgraderTest {
}
@Test
- public void testBlockVersionChangeHalfwayThoughThenNewRevision() {
+ public void testBlockVersionChangeHalfwayThroughThenNewRevision() {
// Friday, 16:00
tester.at(Instant.parse("2017-09-29T16:00:00.00Z"));
@@ -541,7 +542,7 @@ public class UpgraderTest {
tester.controllerTester().upgradeSystem(version);
ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
- // Block upgrades on weekends and ouside working hours
+ // Block upgrades on weekends and outside working hours
.blockChange(false, true, "mon-fri", "00-09,17-23", "UTC")
.blockChange(false, true, "sat-sun", "00-23", "UTC")
.region("us-west-1")
@@ -577,11 +578,12 @@ public class UpgraderTest {
app.runJob(stagingTest).triggerJobs().jobAborted(productionUsCentral1); // Retry will include revision.
tester.triggerJobs(); // Triggers us-central-1 before platform upgrade is cancelled.
- // A new version is also released, cancelling the upgrade, since it is failing on a now outdated version.
+ // A new version is also released, and someone cancels the upgrade, suspecting it is faulty.
tester.clock().advance(Duration.ofHours(17));
version = Version.fromString("6.4");
tester.controllerTester().upgradeSystem(version);
tester.upgrader().maintain();
+ tester.deploymentTrigger().cancelChange(app.instanceId(), PLATFORM);
// us-central-1 succeeds upgrade to 6.3, with the revision, but us-east-3 wants to proceed with only the revision change.
app.runJob(productionUsCentral1);